# Restore the cleaned workspace (verbose = TRUE lists each object as it is
# loaded) and read the supplementary wide-format CSV.
load(file = "data/cleaned/data_clean.RData", verbose = TRUE)
supp <- read.csv(file = "data/cleaned/Trans_recog_wide_anon.csv")

# Identifier / design columns that are carried through the reshapes below.
id_vars <- c(
  # Respondent, group and discussion-role identifiers
  "phase", "group_id", "ind_id",
  "discuss_type", "discuss_type_label",
  "is_listener", "is_speaker",
  "listener_id", "speaker_1_id", "speaker_2_id",
  "arm_label",
  # Treatment labels by round
  "treat_type_r1", "treat_type_r1_label",
  "treat_type_r2", "treat_type_r2_label",
  "treat_type_r1_pooled",
  "public", "discussant",
  # Arm indicator columns
  "public_observer", "public_non_observer",
  "discussion_pair_speaker", "discussion_pair_listener",
  "discussion_full", "discussion_pooled",
  # Video / group assignment
  "video_type", "group", "group_label", "group_video",
  # Stratum and delivery-incentive columns
  "stratum_id", "delivery_incentive", "delivery_incentive_exp"
)

# MAIN CHOICES  ---------------------------------------------------------------
# Round-2 hiring choices in long format: one row per row of `df` x `round`,
# with the comparator position joined on and indicators for choosing the
# comparator / the trans candidate derived from `hiring_choice`.
r2_choices <- df %>%
  # Columns named r2_<attribute>_<candidate>_<round> and hiring_choice_<round>:
  # the trailing digit becomes `round`, the remainder stays as the column name.
  pivot_longer(
    c(matches("r2_(photo|educ|reliability|english|items|trans)_\\d_\\d$"), matches("hiring_choice_\\d$")),
    names_to = c(".value", "round"),
    names_pattern = "(.*)_(\\d$)"
  ) %>%
  mutate(round = as.numeric(round)) %>%
  # Use ind_id wherever ind_id_original is missing.
  mutate(ind_id_original = coalesce(ind_id_original, ind_id)) %>%
  # left_join_track() is a project helper defined outside this section.
  # NOTE(review): select() keeps only comparator_order_in_pair, so the join
  # keys (ind_id, photo_order_pair_id) are only available if rand_details is
  # grouped by them -- confirm.
  left_join_track(
    rand_details %>% filter(round == "r2") %>% select(comparator_order_in_pair),
    by = c("ind_id_original" = "ind_id", "round" = "photo_order_pair_id"),
    suffix = c("", ".extra")
  ) %>%
  mutate(
    # Position (1 or 2) of the photo whose id contains "T"; NA if neither does.
    r2_trans = case_when(
      str_detect(r2_photo_1, "T") ~ 1L,
      str_detect(r2_photo_2, "T") ~ 2L
    ),
    r2_choose_comparator = hiring_choice == comparator_order_in_pair,
    r2_choose_trans = hiring_choice == r2_trans
  ) %>%
  # -99 is the refusal code for hiring_choice.
  mutate(hiring_choice_refuse = hiring_choice == -99) %>%


  # If hiring_choice_refuse is true (14 values), then replace r2_choose_trans with the median
  mutate(pair_includes_trans = !is.na(r2_trans)) %>%
  # Work on 0/1 numerics for the imputation, then convert back to logical.
  mutate(r2_choose_trans = as.numeric(r2_choose_trans),
         r2_choose_comparator = as.numeric(r2_choose_comparator)) %>%
  tidylog::mutate(
    r2_choose_trans = ifelse(hiring_choice_refuse & pair_includes_trans, median_na(r2_choose_trans), r2_choose_trans),
    r2_choose_comparator = ifelse(hiring_choice_refuse, median_na(r2_choose_comparator), r2_choose_comparator)
  ) %>%
  # Where r2_choose_trans is defined and disagrees with r2_choose_comparator,
  # the comparator indicator is overwritten with the trans indicator.
  tidylog::mutate(r2_choose_comparator = ifelse(r2_choose_comparator != r2_choose_trans & !is.na(r2_choose_trans),
                                       r2_choose_trans, r2_choose_comparator)) %>%
  mutate(r2_choose_trans = as.logical(r2_choose_trans),
         r2_choose_comparator = as.logical(r2_choose_comparator)) %>%

  # TRUE when either photo id starts with "W".
  mutate(pair_includes_female = str_detect(r2_photo_1, "^W") | str_detect(r2_photo_2, "^W")) %>%
  # Make "single" (and "(Missing)" for the _exp version) the leading factor levels.
  mutate(
    delivery_incentive = fct_relevel(delivery_incentive, "single"),
    delivery_incentive_exp = fct_relevel(delivery_incentive_exp, "(Missing)", "single")
  ) %>%
  mutate(item_diff_fct = factor(item_diff)) %>%

  mutate(
    # Candidate 2 minus candidate 1, sign-flipped when the comparator is
    # candidate 1, i.e. comparator minus the other candidate.
    r2_reliability_diff = r2_reliability_2 - r2_reliability_1,
    r2_reliability_diff = ifelse(comparator_order_in_pair == 1, -r2_reliability_diff, r2_reliability_diff),
    r2_reliability_shown = !is.na(r2_reliability_diff),
    # Reliability value of the comparator candidate.
    r2_reliability_benchmark = ifelse(comparator_order_in_pair == 1, r2_reliability_1, r2_reliability_2),
    # Rows without a reliability difference get 0 for both derived columns;
    # r2_reliability_shown records which rows those are.
    r2_reliability_diff = ifelse(!r2_reliability_shown, 0, r2_reliability_diff),
    r2_reliability_benchmark = ifelse(!r2_reliability_shown, 0, r2_reliability_benchmark)
  ) %>%
  # Final type of r2_choose_trans in this table is numeric (0/1/NA).
  mutate(r2_choose_trans = as.numeric(r2_choose_trans)) %>%
  mutate(video_type_label = fct_recode(video_type, "Legal Rights" = "treatment", "Rights Messaging" = "placebo", "Control" = "control")) %>%
  # `&` binds tighter than `|`: (phase 1 and group == 1) or
  # (phase 2 and discuss_type == "discussion_full").
  mutate(discuss = phase == "phase_1" & group == 1 | phase == "phase_2" & discuss_type == "discussion_full") %>%
  ungroup %>%
  glimpse

# MAKE THE RI variables numeric otherwise they don't work in RI calculations
# Numeric (0/1) recodes of the round-2 choice table, plus a few derived
# arm dummies and interactions.
r2_choices_num <- r2_choices %>%
  mutate(
    group_label = as.numeric(group_label == "discuss"),
    r2_choose_comparator = as.numeric(r2_choose_comparator),
    pair_includes_trans = as.numeric(pair_includes_trans),
    # One dummy per video arm.
    video_type_control = as.numeric(video_type == "control"),
    video_type_placebo = as.numeric(video_type == "placebo"),
    video_type_treatment = as.numeric(video_type == "treatment"),
    r2_reliability_shown = as.numeric(r2_reliability_shown),
    # Discussion-arm dummies. `%in% 1` never returns NA, so a missing
    # is_listener counts as "not a listener".
    discussion_full = as.numeric(discuss_type == "discussion_full"),
    discussion_listener = as.numeric(
      discuss_type == "discussion_pair" & is_listener %in% 1
    ),
    discussion_speaker = as.numeric(
      discuss_type == "discussion_pair" & !(is_listener %in% 1)
    ),
    pair_includes_trans_discussion_full = pair_includes_trans * discussion_full,
    control = as.numeric(discuss_type == "control")
  ) %>%
  # Coerce the remaining arm indicator columns to numeric.
  mutate(
    across(
      c(
        public, public_observer, public_non_observer,
        discussion_pair_speaker, discussion_pair_listener,
        discussion_full, discussion_pooled
      ),
      as.numeric
    )
  ) %>%
  mutate(
    pair_includes_trans_alt = pair_includes_trans,
    # Prefer the trans-choice indicator when it is defined, otherwise keep the
    # comparator-choice indicator.
    r2_choose_comparator = coalesce(r2_choose_trans, r2_choose_comparator),
    delivery_incentive_multi = as.numeric(delivery_incentive_exp == "multi"),
    control_pooled = as.numeric(control == 1 | public_non_observer == 1)
  )

# Group predictions ---------------------------------------------------------------
# Within-group predictions in long format: one row per row of `df` x
# prediction `round`, with the trans candidate's position, trans-minus-other
# attribute differences, and the prediction recoded as "chose trans".
# The result is left grouped by group_id.
group_predic <- df %>%
  pivot_longer(
    c(
      matches("group_predic_(choice|photo|educ|reliability|trans|age|items)_\\d"),
      matches("group_member_know")
    ),
    names_pattern = "(group_.*)_(\\d)$",
    names_to = c(".value", "round")
  ) %>%
  # Drop all-NA rows and columns. `which` is spelled out because
  # remove_empty() emits a "defaulting to" message when it is omitted; the
  # value given here is that default, so the result is unchanged.
  janitor::remove_empty(which = c("rows", "cols")) %>%
  # Position (1 or 2) of the photo whose id starts with "T"; NA if neither.
  mutate(
    group_predic_trans = case_when(
      str_detect(group_predic_photo_1, "^T") ~ 1,
      str_detect(group_predic_photo_2, "^T") ~ 2
    )
  ) %>%
  mutate(
    # Differences are trans candidate minus the other candidate; NA when the
    # pair has no trans candidate.
    item_diff = case_when(
      group_predic_trans == 2 ~ group_predic_items_2 - group_predic_items_1,
      group_predic_trans == 1 ~ group_predic_items_1 - group_predic_items_2
    ),
    reliability_diff = case_when(
      group_predic_trans == 2 ~ group_predic_reliability_2 - group_predic_reliability_1,
      group_predic_trans == 1 ~ group_predic_reliability_1 - group_predic_reliability_2
    ),
    reliability_shown = !is.na(reliability_diff),
    reliability_diff = ifelse(!reliability_shown, 0, reliability_diff),

    # Reliability of the non-trans candidate in the pair.
    reliability_benchmark = case_when(
      group_predic_trans == 2 ~ group_predic_reliability_1,
      group_predic_trans == 1 ~ group_predic_reliability_2
    ),

    # Rows without a reliability difference get 0; reliability_shown flags them.
    reliability_benchmark = ifelse(!reliability_shown, 0, reliability_benchmark)
  ) %>%
  mutate(
    # -99 is treated as missing.
    group_predic_choice = ifelse(group_predic_choice == -99, NA, group_predic_choice),
    group_predic_choose_trans = group_predic_choice == group_predic_trans
  ) %>%
  mutate(group_member_know = ifelse(group_member_know == -99, NA, group_member_know)) %>%
  group_by(group_id) %>%
  mutate(
    # Group mean of the reversed score (5 - answer).
    group_member_know_mean = mean_na(5 - group_member_know)
  )


# Diagnostic: distribution of prediction rows by phase.
group_predic %>% ungroup %>% count_prop(phase)

# For each row of `df` (KEY) and prediction round, the id of the group member
# whose choice was being predicted, plus that member's role flags.
group_predic_member <- df %>%
  select(
    KEY, all_of(id_vars),
    matches("group_predic_a_or_b_\\d"),
    matches("group_predic_member_(a|b)"),
    matches("group_predic_(photo|items)_(1|2)_")
  ) %>%
  # Everything ending in _<digit> goes long; values are stacked as character so
  # that columns of different types can share one `value` column.
  pivot_longer(
    matches("_\\d"),
    values_transform = as.character,
    names_pattern = "(.*)_(\\d)",
    names_to = c("name", "round")
  ) %>%
  # Only the "a or b" rows are needed: they say which member was predicted.
  filter(str_detect(name, "group_predic_a_or_b")) %>%
  mutate(
    value = case_when(
      value == "a" ~ group_predic_member_a,
      value == "b" ~ group_predic_member_b
    )
  ) %>%
  ungroup %>%
  select(KEY, round, group_id, group_predic_member = value) %>%
  # Member ids follow the "<group_id>_<member>" convention of ind_id.
  mutate(group_predic_member_id = paste0(group_id, "_", group_predic_member)) %>%
  select(KEY, round, group_predic_member_id) %>%
  # Add on whether the group predic member is a listener or a speaker.
  # The join key is named explicitly rather than relying on the implicit
  # natural join (and its "Joining with `by = ...`" message).
  tidylog::left_join(
    df %>%
      select(
        group_predic_member_id = ind_id,
        group_predic_member_is_speaker = is_speaker,
        group_predic_member_is_listener = is_listener,
        group_predic_member_is_observer = public_observer,
        group_predic_member_is_non_observer = public_non_observer
      ),
    by = "group_predic_member_id"
  )

# Merge this onto the group_predic choices
# Attach the predicted member's id to every prediction row (full join, so
# unmatched rows from either side are kept).
group_predic_w_member_id <- tidylog::full_join(
  group_predic,
  group_predic_member,
  by = c("KEY", "round")
)


# GET ACTUAL CHOICES MADE IN R2 by those people
# One row per ind_id with that person's round-2 rows nested in `data`.
r2_choices_for_merge <- r2_choices %>%
  group_by(ind_id) %>%
  select(
    ind_id,
    matches("r2_(photo|educ|reliability|english|items|trans)_\\d$"),
    r2_trans,
    r2_choose_trans,
    r2_choose_comparator,
    hiring_choice,
    comparator_order_in_pair
  ) %>%
  print() %>%
  nest()

# Merge onto the group_predic dataset using the group_predic_member_id
# At the predictor x round level (i.e. including all 4 predictions)
# Pair every prediction with the predicted member's actual round-2 choice on
# the same photo pair, and score the prediction.
group_predic_w_actual_choices <- group_predic_w_member_id %>%
  tidylog::left_join(r2_choices_for_merge, by = c("group_predic_member_id" = "ind_id")) %>%
  # Expand the nested round-2 rows: one row per prediction x member's r2 row.
  unnest(data) %>%
  # Keep only the r2 row showing the same two photos as the prediction, in
  # either order.
  filter(
    group_predic_photo_1 == r2_photo_1 & group_predic_photo_2 == r2_photo_2 |
      group_predic_photo_1 == r2_photo_2 & group_predic_photo_2 == r2_photo_1
  ) %>%
  mutate(
    order_switched = group_predic_photo_1 == r2_photo_2 & group_predic_photo_2 == r2_photo_1
  ) %>%

  # Express the prediction in the member's photo order: 3 - x maps 1 <-> 2.
  mutate(
    group_predic_switched = ifelse(
      order_switched, 3 - group_predic_choice, group_predic_choice
    )
  ) %>%

  mutate(correct_predic = group_predic_switched == hiring_choice,
         group_predic_choose_comparator = group_predic_switched == comparator_order_in_pair) %>%
  ungroup %>%
  # count_prop() is a project helper used mid-pipe here; presumably it prints a
  # tabulation and passes the data through -- TODO confirm.
  count_prop(group_predic_choose_comparator) %>%
  mutate(
    group_predic_includes_trans = !is.na(group_predic_trans),
    r2_includes_trans = !is.na(r2_trans)
  ) %>%
  # Predicted minus actual comparator choice.
  mutate(
    misper = group_predic_choose_comparator - r2_choose_comparator
  )

# Diagnostics: distribution of actual choices among matched predictions, and a
# look at the comparator-related columns of r2_choices.
group_predic_w_actual_choices %>%
  count_prop(hiring_choice)

r2_choices %>%
  select(matches("comparator"))

# Stack predicted and actual comparator choices into name/value pairs and give
# the two series display labels.
group_predic_long <- group_predic_w_actual_choices %>%
  pivot_longer(
    cols = c(group_predic_choose_comparator, r2_choose_comparator)
  ) %>%
  mutate(
    name = dplyr::recode(
      name,
      "group_predic_choose_comparator" = "Predicted choices\n(within group)",
      "r2_choose_comparator" = "Actual\nchoices"
    )
  )

group_predic_long %>%
  count_prop(group_predic_includes_trans)

# Leadership scale ---------------------------------------------------------------

# Leadership-scale answers in long format: one row per phase-2 row of `df`
# (KEY) x rated person x item, with reverse coding and an acquiescence-bias
# adjustment.
# NOTE(review): the object name `ls` masks base::ls() for the rest of the
# session.
ls <- df %>%
  filter(phase == "phase_2") %>%
  select(KEY, all_of(id_vars), matches("ls\\d_\\d"), matches("ls\\d_refuse_\\d"),
         matches("group_comp_a_or_b"),
         group_predic_member_a_gc,
         group_predic_member_b_gc) %>%
  # Drop any column whose name contains "[".
  select(-matches("\\[")) %>%
  # ls<k>_<p> -> ls<k>_answer_<p>, so answers and refusals share the pattern
  # ls<k>_(answer|refuse)_<p>.
  rename_with(~ifelse(str_detect(.x, "ls\\d_\\d"), str_replace(.x, "(ls\\d)_(\\d)", "\\1_answer_\\2"), .x)) %>%
  # First pivot: trailing digit -> `person`.
  pivot_longer(matches("ls\\d|group_comp_a_or_b"), names_pattern = "(.*)_(\\d)", names_to = c(".value", "person")) %>%
  # Second pivot: item (ls1, ls2, ...) -> `name`, with `answer` and `refuse` columns.
  pivot_longer(matches("ls\\d"), names_pattern = "(ls\\d)_(answer|refuse)", names_to = c("name", ".value")) %>%

  # Impute the person ID:
  mutate(ls_person_id = ifelse(group_comp_a_or_b == "a", group_predic_member_a_gc, group_predic_member_b_gc)) %>%
  mutate(ls_ind_id = str_glue("{group_id}_{ls_person_id}")) %>%


  # A non-missing `refuse` value takes precedence over the answer.
  mutate(value = coalesce(refuse, answer)) %>%
  # Items ls2 and ls6 are flagged for reverse coding and renamed with a _REV suffix.
  mutate(
    rev = name %in% c("ls2", "ls6"),
    name = ifelse(rev, str_glue("{name}_REV"), name)
  ) %>%
  # Negative values are replaced by the item-level median.
  # NOTE(review): median_na(value) is evaluated on the item's full vector, which
  # still contains the negative codes -- confirm that is intended.
  group_by(name) %>%
  mutate(value = ifelse(value < 0, median_na(value), value)) %>%

  # This group_by() replaces the grouping by `name`.
  group_by(KEY) %>%
  # NOTE(review): mean() is called without NA handling, so mean_val is NA for
  # any KEY with a missing value.
  # val_rev: flagged items become 10 - value, the rest are unchanged.
  mutate(mean_val = mean(value),
         resid_val = value - mean_val,
         val_rev = ifelse(rev, 10 - (value - 0), value)) %>%
  ungroup %>% count_prop(phase, rev, value, val_rev) %>%

  group_by(KEY) %>%
  # Per-KEY means of the non-reversed and the reversed items (both taken on
  # val_rev).
  mutate(
    mean_val_pos_coded = mean(val_rev[!rev]),
    mean_val_neg_coded = mean(val_rev[rev])
  ) %>%
  # Half the gap between the two means is subtracted from every item.
  mutate(
    acqui_bias = (mean_val_pos_coded - mean_val_neg_coded) / 2,
    val_acqui = val_rev - acqui_bias
  ) %>%
  ungroup

# List of members (to use for group relations below)
# Lookup from (KEY, ind_id, person) to the rated person's id; dups_drop() and
# dups_report() are project helpers.
ls_ind_ids <- ls %>%
  glimpse() %>%
  select(KEY, ind_id, person, ls_ind_id) %>%
  dups_drop() %>%
  dups_report(ind_id)

# Items as columns (acquiescence-adjusted values), identifiers removed, as
# input for the factor model.
ls_wide_acqui <- ls %>%
  select(KEY, person, name, val_acqui) %>%
  pivot_wider(
    names_from = name,
    values_from = val_acqui
  ) %>%
  select(-c(KEY, person))

# One-factor loadings; the MR1 column is renamed to `loading`.
ls_loadings <- ls_wide_acqui %>%
  factor_loadings(n_factors = 1) %>%
  rename(loading = MR1)

# Same wide table, identifiers kept, with an index added by the project helper
# add_inverse_cov_index() using unit weights.
ls_inv_cov <- ls %>%
  select(KEY, person, name, val_acqui) %>%
  pivot_wider(
    names_from = name,
    values_from = val_acqui
  ) %>%
  mutate(weight = 1) %>%
  add_inverse_cov_index(
    -c(KEY, person),
    var = "ls_inv_cov",
    weight_var = weight
  )

# Get values of each sub-measure disaggregated at the individual level
# Item-level (reverse-coded) values per rated person: averaged over everyone
# who rated them, then standardised with z_calc_std() into <item>_z columns.
ls_disagg <- ls %>%
  select(
    ls_ind_id,
    respondent_id = ind_id,
    name,
    val_rev
  ) %>%
  dups_report(ls_ind_id, respondent_id, name) %>%
  pivot_wider(
    names_from = name,
    values_from = val_rev
  ) %>%
  arrange(ls_ind_id, respondent_id) %>%
  # Aggregate to the rated-person level.
  group_by(ls_ind_id) %>%
  summarise(across(ls1:ls9, \(item) mean_na(item))) %>%
  mutate(across(ls1:ls9, z_calc_std, .names = "{.col}_z"))

# Leadership scores per rater (KEY) x rated person: raw sum, acquiescence-
# adjusted sum, and loading-weighted sum, plus the inverse-covariance index.
ls_scores <- ls %>%
  left_join(ls_loadings, by = c(name = "var")) %>%
  # Loadings below 0.3 get zero weight.
  # NOTE(review): the threshold is on the signed loading here, whereas the
  # group-relations scores use abs(loading) >= 0.3 -- confirm that dropping
  # strongly negative loadings is intended.
  mutate(loading = ifelse(loading < 0.3, 0, loading)) %>%
  group_by(KEY, group_id, ind_id, person, ls_ind_id) %>%
  summarise(
    ls_score = sum_na(val_rev),
    ls_score_acqui = sum_na(val_acqui),
    ls_score_fact = sum_na(val_acqui * loading)
  ) %>%
  ungroup() %>%
  left_join(
    select(ls_inv_cov, KEY, person, ls_inv_cov),
    by = c("KEY", "person")
  ) %>%
  # Z score
  mutate(ls_score_fact_z = z_calc_std(ls_score_fact))

# Average each score over all raters of the same person and attach the
# item-level breakdown.
ls_scores_ind <- ls_scores %>%
  group_by(ls_ind_id) %>%
  summarise(across(ls_score:ls_score_fact_z, \(score) mean_na(score))) %>%
  left_join(ls_disagg, by = "ls_ind_id")

# Group-level leadership scores: for every rated person, the mean score of
# their group and a second summary from the project helper mean_exclude()
# (presumably the group mean leaving out the person's own score -- TODO
# confirm), plus median-split indicators.
ls_scores_group <- ls_scores_ind %>%
  # ls_ind_id is "<group_id>_<person id>". Strip only the trailing
  # "_<digits>": the previous unanchored "_\\d" removed the first "_<digit>"
  # anywhere in the id, which corrupts the result if group_id itself contains
  # an underscore followed by a digit or the person id has several digits.
  mutate(group_id = str_replace(ls_ind_id, "_\\d+$", "")) %>%
  group_by(group_id) %>%
  mutate(
    # Adds <score>_group and <score>_group_excl columns.
    across(
      ls_score:ls_score_fact_z,
      list(group = mean_na, group_excl = mean_exclude)
    )
  ) %>%
  ungroup %>%
  mutate(
    # Splits at the overall median of the respective group-level score.
    high_ls_score_group = ls_score_fact_z_group > median_na(ls_score_fact_z_group),
    high_ls_score_others = ls_score_fact_z_group_excl > median_na(ls_score_fact_z_group_excl),
    # TRUE when the _group_excl score exceeds the person's own score.
    higher_ls_score_others = ls_score_fact_z_group_excl > ls_score_fact_z
  )

# Phase-2 round-2 choices with the chooser's own leadership scores attached
# (matched on ind_id = ls_ind_id within group).
r2_with_ls <- r2_choices_num %>%
  filter(phase == "phase_2") %>%
  tidylog::left_join(
    ls_scores_group,
    by = c(ind_id = "ls_ind_id", group_id = "group_id")
  ) %>%
  ungroup()

# Subsample: control respondents and discussion-pair listeners only.
# is_listener becomes logical, with missing values treated as FALSE.
r2_with_ls_listener_vs_control <- r2_with_ls %>%
  filter(
    phase == "phase_2",
    discuss_type == "control" |
      (discuss_type == "discussion_pair" & is_listener == 1)
  ) %>%
  mutate(
    is_listener = as.logical(ifelse(is.na(is_listener), 0, is_listener))
  )

# Group relations ---------------------------------------------------------------
# group_predic_member
# PHASE 1:
# Phase 1: relation questions in long format (one row per KEY x round), with
# the id of the member the questions were asked about.
group_relations_phase_1 <- df %>%
  filter(phase == "phase_1") %>%
  select(
    KEY,
    all_of(id_vars),
    matches("group_member_know"),
    matches("group_member_relation")
  ) %>%
  pivot_longer(
    matches("group_member_know|group_member_relation"),
    names_pattern = "(.*)_(\\d)",
    names_to = c(".value", "round")
  ) %>%
  # only rounds 1 and 3 elicit the group member relations
  # (`round` is a character column after the pivot)
  filter(round %in% c("1", "3")) %>%
  tidylog::left_join(group_predic_member, by = c("KEY", "round")) %>%
  rename(group_relation_ind_id = group_predic_member_id)

# Phase 2: same layout, additionally carrying the gc<k> questions; the rated
# member's id comes from the leadership-scale lookup, matching `round` to
# `person`.
group_relations_phase_2 <- df %>%
  filter(phase == "phase_2") %>%
  select(
    KEY,
    all_of(id_vars),
    matches("group_member_know"),
    matches("group_member_relation"),
    matches("gc\\d")
  ) %>%
  pivot_longer(
    matches("group_member_know|group_member_relation|gc\\d"),
    names_pattern = "(.*)_(\\d)",
    names_to = c(".value", "round")
  ) %>%
  glimpse() %>%
  filter(round %in% c("1", "2")) %>%
  count_nas() %>%
  tidylog::left_join(
    ls_ind_ids,
    by = c("KEY", "ind_id", "round" = "person")
  ) %>%
  rename(group_relation_ind_id = ls_ind_id) %>%
  glimpse()

# Stack both phases and derive cleaned, reverse-coded and standardised
# relation measures at the rater x rated-member level.
group_relations <- bind_rows(group_relations_phase_1, group_relations_phase_2) %>%
  # Missing values for group_member_know
  # (NA or any negative code counts as missing and is set to NA)
  mutate(
    group_member_know_missing = is.na(group_member_know) | group_member_know < 0,
    group_member_know = ifelse(group_member_know_missing, NA, group_member_know)
  ) %>%

  # Reverse code group_member_know
  mutate(
    group_member_know = 5 - group_member_know
  ) %>%

  # Create variables from group_member_relation
  # split_multiple() and in_list() are project helpers; presumably the former
  # splits a multi-select answer into a list of codes -- TODO confirm.
  # Each indicator is TRUE if one of the listed codes was selected OR the
  # free-text "other" answer matches one of the listed terms.
  mutate(
    group_member_relation_list = split_multiple(group_member_relation),
    close_family = map_lgl(group_member_relation_list, ~ any(c("3", "4", "5", "6", "7") %in% .x)) |
      (!is.na(group_member_relation_other) & str_detect(group_member_relation_other, "Akka|Atthai|Athai|Chinna mamiyar|Chiththi|Sister|Sister marumagal")),
    other_family = map_lgl(group_member_relation_list, ~ any(c("8") %in% .x)) |
      (!is.na(group_member_relation_other) & str_detect(group_member_relation_other, "Periyamma|Cousin's sister")),
    neighbour = in_list("1", group_member_relation_list) |
      (!is.na(group_member_relation_other) & str_detect(group_member_relation_other, "Neighbour|Next street")),
    friend = in_list("2", group_member_relation_list) |
      (!is.na(group_member_relation_other) & str_detect(group_member_relation_other, "Friend")),
  ) %>%

  #   Clean the gc things from phase 2
  # First record a <gc>_missing flag (negative code or NA), then blank out the
  # negative codes.
  tidylog::mutate(
    across(c(gc3, gc4, gc5, gc6, gc7), list(missing = ~ .x < 0 | is.na(.x))),
    across(c(gc3, gc4, gc5, gc6, gc7), ~ ifelse(.x < 0, NA, .x))
  ) %>%
  # Standardised copies (<var>_z) of every measure via z_calc_std().
  mutate(
    across(c(group_member_know, close_family, other_family, neighbour, friend, gc3, gc4, gc5, gc6, gc7), list(z = z_calc_std))
  ) %>%
  mutate(
    any_family = close_family | other_family
  )


# Get an individual-level version for the balance table
# Per respondent: totals (via sum_na) of the family / neighbour / friend
# indicators over the members they were asked about.
group_relations_ind <- group_relations %>%
  group_by(group_id, ind_id) %>%
  summarise(
    across(c(any_family, neighbour, friend), sum_na)
  )

# Names of the standardised relation measures entering the factor model.
group_relation_vars <- paste0(
  c(
    "group_member_know", "close_family", "other_family", "neighbour", "friend",
    "gc3", "gc4", "gc5", "gc6", "gc7"
  ),
  "_z"
)

# One-factor loadings of those measures; the MR1 column is renamed to `loading`.
group_relations_loadings <- group_relations %>%
  select(all_of(group_relation_vars)) %>%
  factor_loadings(n_factors = 1) %>%
  rename(loading = MR1)

# Display id and question label for each measure (used for the loadings table).
group_relation_labs <- tribble(
  ~var_i, ~var,                  ~label,
  "1a",   "neighbour_z",         "Neighbour (=1)",
  "1b",   "friend_z",            "Friend (=1)",
  "1c",   "close_family_z",      "Close family (=1)",
  "1d",   "other_family_z",      "Other family (=1)",
  "2",    "group_member_know_z", "How well do they know?",
  "3",    "gc3_z",               "How long have they known?",
  "4",    "gc7_z",               "Frequency: talking",
  "5",    "gc4_z",               "Frequency: asking advice",
  "6",    "gc5_z",               "Frequency: asking recommendations",
  "7",    "gc6_z",               "Frequency: tell secrets"
)

# Write the labelled loadings to a LaTeX table (no row names, no float wrapper).
group_relations_loadings %>%
  left_join(group_relation_labs, by = "var") %>%
  arrange(var_i) %>%
  select(ID = var_i, Question = label, Loading = loading) %>%
  xtable(digits = 2) %>%
  print(
    include.rownames = FALSE,
    file = "outputs/tables/group_relations_loadings.tex",
    floating = FALSE
  )

# Diagnostics on two of the phase-2 items (non-missing answers only).
group_relations %>%
  filter(!is.na(gc7)) %>%
  count_prop(gc7) # 62.6% people speak to each other EVERY DAY

group_relations %>%
  filter(!is.na(gc6)) %>%
  count_prop(gc6) # 34.4% people ever tell secrets to other person

# At the rater x ratee level
# Loading-weighted relation score for each rater x rated member; measures whose
# absolute loading is below 0.3 get zero weight. The score is then standardised.
relations_scores <- group_relations %>%
  pivot_longer(
    cols = all_of(group_relation_vars),
    names_to = "var",
    values_to = "value"
  ) %>%
  left_join(group_relations_loadings, by = "var") %>%
  mutate(loading = ifelse(abs(loading) < 0.3, 0, loading)) %>%
  group_by(KEY, group_id, ind_id, group_relation_ind_id) %>%
  summarise(group_relation_score_fact = mean_na(value * loading)) %>%
  ungroup() %>%
  mutate(group_relation_score_fact_z = z_calc_std(group_relation_score_fact))

# At the rater level
# Mean standardised score a rater gives to others, with an above-median flag.
relations_scores_ind <- relations_scores %>%
  group_by(KEY, ind_id) %>%
  summarise(
    relation_score_fact_z_ind_others = mean_na(group_relation_score_fact_z)
  ) %>%
  ungroup() %>%
  mutate(
    close_knit_ind_others =
      relation_score_fact_z_ind_others > median_na(relation_score_fact_z_ind_others)
  )

# At the group level
# Mean standardised score within the group, with an above-median flag.
relations_scores_group <- relations_scores %>%
  group_by(group_id) %>%
  summarise(
    relation_score_fact_z_group = mean_na(group_relation_score_fact_z)
  ) %>%
  ungroup() %>%
  mutate(
    close_knit_group =
      relation_score_fact_z_group > median_na(relation_score_fact_z_group)
  )

# Round-2 choices with group-level and rater-level relation scores and the
# leadership scores attached; indicator columns are made numeric.
r2_with_relations <- r2_choices_num %>%
  tidylog::left_join(relations_scores_group, by = "group_id") %>%
  tidylog::left_join(relations_scores_ind, by = c("KEY", "ind_id")) %>%
  tidylog::left_join(
    ls_scores_group,
    by = c(ind_id = "ls_ind_id", group_id = "group_id")
  ) %>%
  mutate(
    # A missing is_listener is treated as 0.
    is_listener = ifelse(is.na(is_listener), 0, is_listener),
    across(
      c(higher_ls_score_others, high_ls_score_others, close_knit_ind_others),
      as.numeric
    )
  ) %>%
  ungroup()

# Control vars df ---------------------------------------------------------------

# r1_choices$comparator_order_in_pair
# Respondent-level control variables: ind_id plus whichever of the columns
# named in `control_vars` (defined outside this section) exist after adding the
# individual-level relation counts.
control_vars_df <- df %>%
  left_join(group_relations_ind, by = c("ind_id", "group_id")) %>%
  select(ind_id, any_of(unname(control_vars)))

# List experiment ---------------------------------------------------------------
# Long format: one row per respondent x answered list question.
list_exp <- df %>%
  # Recode group: 0 -> "ind", anything else -> "group".
  mutate(group = ifelse(group == 0, "ind", "group")) %>%

  select(all_of(id_vars), ind_id, phase, group_id, video_type, matches("video_type"), discuss_type, discuss_type_label, group, group_label, stratum_id, group_video, arm_label, matches("list_group"), matches("list_order_first"), matches("list_\\d_answer_\\d$")) %>%
  # list_<k>_answer_<j> columns -> list_type = "list_<k>", list_answer = value.
  pivot_longer(
    matches("list_\\d_answer_\\d$"),
    names_pattern = c("(list_\\d)_answer_\\d"),
    names_to = "list_type",
    values_to = "list_answer"
  ) %>%
  # Keep only the questions that were actually answered.
  filter(!is.na(list_answer)) %>%
  mutate(
    # Numeric list number, and a dummy for list 2.
    list_type_i = as.numeric(str_match(list_type, "list_(\\d)")[, 2]),
    list_b = as.numeric(list_type_i == 2),
  ) %>%
  mutate(arm_label = fct_relevel(arm_label, "control+no_discuss")) %>%
  # 1 when this list is the one recorded in list_group for the respondent.
  mutate(trans_in_list_group = as.numeric(list_group == list_type_i)) %>%
  relocate(list_b, .after = everything()) %>%
  # Interaction of trans_in_list_group with the group == "group" dummy.
  mutate(trans_in_list_group_group_labeldiscuss = trans_in_list_group * as.numeric(group == "group")) %>%

  # NOTE(review): no `by` is given, so this joins on every column the two
  # tables share; the other joins to control_vars_df in this file use
  # by = "ind_id" -- confirm the implicit key set is the intended one.
  tidylog::left_join(control_vars_df) %>%
  mutate(list_answer = as.numeric(list_answer)) %>%
  print

# Laws / attitudes ---------------------------------------------------------------

# Outcome columns that enter the law-knowledge index, and the names of their
# standardised versions.
law_measures <- c("k2_strict", "k2.2_n_rights_do", "k4_illegal", "k6.2_illegal")
law_measures_z <- paste0(law_measures, "_z")

# Respondent-level law-knowledge and attitude measures derived from the k*
# survey items, plus an inverse-covariance index of the standardised measures.
laws_atts <- df %>%
  # Codes -98 and -99 on k2 are recoded to 0.
  mutate(k2 = ifelse(k2 == -98 | k2 == -99, 0, k2)) %>%

  # k2.2 is a multi-select item; split_multiple() is a project helper.
  mutate(
    k2.2_list = split_multiple(k2.2),
    # Any of codes 2-9 selected / how many of codes 2-9 were selected.
    k2.2_correct = map_lgl(k2.2_list, ~ any(2:9 %in% .x)),
    k2.2_n_rights = map_int(k2.2_list, ~ sum(2:9 %in% .x)),
    # Code 13 selected / code -97 selected.
    k2.2_vote = map_lgl(k2.2_list, ~ 13 %in% .x),
    k2.2_other_yn = map_lgl(k2.2_list, ~ -97 %in% .x)
  ) %>%

  # k2.2_should_do holds space-separated codes; flag whether "2" is among them.
  mutate(
    k2.2_should_do_list = str_split(k2.2_should_do, " "),
    k2.2_should_do_2 = map_lgl(k2.2_should_do_list, \(x) "2" %in% x)
  ) %>%
  # Strict measure: k2 == 1 and both k2.2 flags are TRUE.
  mutate(k2_strict = k2 == 1 & k2.2_correct & k2.2_should_do_2) %>%
  # Count of selected codes 2-9, zeroed out unless k2.2_should_do_2 is TRUE.
  mutate(
    k2.2_n_rights_do = k2.2_n_rights * as.numeric(k2.2_should_do_2)
  ) %>%

  mutate(
    k3_wrong = k3 == 1 | k3 == 102,
    k4_illegal = k4 == 1,
    k5_sued = k5 == 1
  ) %>%
  mutate(
    k6.1_wrong = k6.1 == 1 | k6.1 == 102,
    k6.2_illegal = k6.2 == 1,
    k6.3_sued = k6.3 == 1
  ) %>%
  # <measure>_z via the project helper z_calc_control(), which receives the full
  # column and the control-video subset (presumably standardising against the
  # control group -- TODO confirm).
  mutate(
    across(
      all_of(law_measures),
      list(z = ~ z_calc_control(.x, .x[video_type == "control"]))
    )
  ) %>%
  # Average of the two "wrong" indicators.
  mutate(attitudes = (k3_wrong + k6.1_wrong)/2) %>%

  # Unit weights for the project helper that adds the index column law_index_z.
  mutate(weight = 1) %>%
  add_inverse_cov_index(
    all_of(law_measures_z),
    var = "law_index_z",
    weight_var = weight
  )


# Diagnostic: mean of the index among control-video respondents.
laws_atts %>% filter(video_type == "control") %>% pull(law_index_z) %>% mean_na()

# Make long version of dataset for attitudes questions
# One row per respondent x attitude question, with control variables attached;
# clashing control columns get the suffix "_control".
atts_long <- laws_atts %>%
  pivot_longer(
    cols = all_of(c("k3_wrong", "k6.1_wrong")),
    names_to = "attitude_type",
    values_to = "attitude_val"
  ) %>%
  tidylog::left_join(
    control_vars_df,
    by = "ind_id",
    suffix = c("", "_control")
  )

# Beliefs about reliability ---------------------------------------------------------------

# Likelihood ratings (items f7.1 / f7.2) in long format: one row per
# respondent x rated worker, flagged for whether the rated photo is the trans
# candidate, with the rating reverse-coded (6 - value).
likelihood <- df %>%
  # Which of the two items refers to the photo whose id contains "T".
  mutate(
    trans_photo_index = case_when(
      str_detect(worker_rating_1_photo_actual, "T") ~ "f7.1",
      str_detect(worker_rating_2_photo_actual, "T") ~ "f7.2"
    )
  ) %>%
  select(-matches("f7.*label")) %>%
  pivot_longer(
    matches("f7\\.\\d"),
    names_to = "likelihood_order"
  ) %>%
  mutate(
    trans_photo_yn = as.numeric(trans_photo_index == likelihood_order)
  ) %>%
  # Negative codes are imputed with the median, and the imputed value is
  # reverse-coded together with the observed ones. Previously the raw-scale
  # median was inserted directly among the reversed answers, which puts it on
  # the wrong side of the scale whenever the median is not 3.
  # NOTE(review): as elsewhere in this file, median_na(value) is taken over the
  # full column, which still contains the negative codes -- confirm.
  mutate(value = 6 - ifelse(value < 0, median_na(value), value)) %>%
  mutate(likely = value >= 4)

# Norms - general ---------------------------------------------------------------
# General norms items n1-n3: take whichever of the _right / _left versions was
# answered, impute refusals, and convert the answers to proportions.
sn <- df %>%
  # Drop columns whose names contain "[" or "_show_".
  select(-matches("\\["), -matches("_show_")) %>%
  mutate(n3 = coalesce(n3_right, n3_left)) %>%
  mutate(n2 = coalesce(n2_right, n2_left)) %>%
  mutate(n1 = coalesce(n1_right, n1_left)) %>%
  mutate(n3_items = norms_5_items) %>%
  mutate(
    # Refusals (-99) are replaced by the median of the SAME item. Previously
    # every item was imputed with the median of n3, so n1 and n2 refusals
    # received a value from a different question.
    # NOTE(review): as elsewhere in this file, the median is taken over the
    # full column, which still contains the -99 codes -- confirm.
    across(c(n1, n2, n3), ~ ifelse(.x == -99, median_na(.x), .x)),

    # Convert to proportions
    across(c(n1, n2, n3), ~ .x / 20)
  )

# R1 choices ---------------------------------------------------------------
# Diagnostic: duplicate report on ind_id in the base data.
df %>% dups_report(ind_id)

# Round-1 choices for everyone except "choosing only" respondents: one row per
# respondent x round-1 pair, with the randomisation details joined on and
# indicators for choosing the comparator / the trans candidate.
r1_choices_main <- df %>%
  # Keep rows where choosing_only is not 1 (including missing).
  tidylog::filter(choosing_only != 1 | is.na(choosing_only)) %>%
  select(
    # pid, id, video_type, group, psu_id, b2_label,
    all_of(id_vars), group_label, group_role, group_role_label, public, discussant,
    public_observer, public_non_observer, discussion_pair_speaker, discussion_pair_listener, discussion_full,
    matches("^r1_(discuss|no_discuss)_(photo|educ|reliability|english|items|trans)_(\\d)_\\d"),
    matches("group_choice(_backup)?_\\d"),
    matches("ind_choice_\\d")
  ) %>%
  select(-matches("duration")) %>%

  # Everything that is not an identifier goes long as (name, round); values are
  # stacked as character so columns of different types can share `value`.
  pivot_longer(
    -c(all_of(id_vars), group_label, group_role, group_role_label),
    names_to = c("name", "round"),
    names_pattern = "(.*)_(\\d$)",
    values_transform = as.character
  ) %>%
  # FOR LISTENERS: remove the r1_discuss rows
  # (rows with a missing is_listener are always kept)
  tidylog::filter(!(str_detect(name, "r1_discuss|group_choice") & is_listener == 1) | is.na(is_listener)) %>%

  # Collapse the discuss / no_discuss and backup variants onto common names so
  # they land in the same columns after widening.
  mutate(name = name %>% str_replace("r1_(no_)?discuss", "r1")) %>%
  mutate(name = name %>% str_replace("group_choice_backup", "group_choice")) %>%
  filter(!is.na(value)) %>%

  pivot_wider(
    names_from = name,
    values_from = value
  ) %>%
  # Back to numeric. NOTE(review): both ranges are positional, so they depend
  # on the column order produced by pivot_wider() -- confirm they cover the
  # intended columns.
  mutate(
    across(r1_educ_1:r1_trans_1, as.numeric),
    across(r1_educ_2:group_choice, as.numeric),
  ) %>%
  mutate(round = as.numeric(round)) %>%


  # Join key: ind_id with any "_alt" removed.
  mutate(ind_id_for_join = ind_id %>% str_replace_all("_alt", "")) %>%
  # Round-1 rows of rand_details, minus the r1_discuss rows of listeners.
  # NOTE(review): rand_details' own `round` is overwritten with "r1" while the
  # left-hand key is also called `round` (matched to photo_order_pair_id) --
  # confirm the resulting column names are the ones used downstream.
  tidylog::left_join(
    rand_details %>% filter(str_detect(round, "r1")) %>%
      filter(!(is_listener == 1 & str_detect(round, "r1_discuss")) | is.na(is_listener)) %>%
      mutate(round = "r1") %>%
      ungroup %>%
      select(-is_listener, -is_speaker),
    by = c("ind_id_for_join" = "ind_id", "round" = "photo_order_pair_id")
  ) %>%

  # Group choice where present, otherwise the individual choice.
  mutate(r1_choice = coalesce(group_choice, ind_choice)) %>%

  mutate(
    # Position (1 or 2) of the photo whose id contains "T"; NA if neither does.
    r1_trans = case_when(
      str_detect(r1_photo_1, "T") ~ 1L,
      str_detect(r1_photo_2, "T") ~ 2L
    ),
    r1_choose_comparator = r1_choice == comparator_order_in_pair,
    r1_choose_trans = r1_choice == r1_trans
  ) %>%
  mutate(group_label = fct_relevel(group_label, "no_discuss"))

# Diagnostics on r1_choices_main: trans choice by discussion type among control
# respondents and listeners, duplicates on ind_id, and the r1 treatment split.
r1_choices_main %>%
  filter(discuss_type == "control" | is_listener %in% 1) %>%
  bar_chart(
    y = r1_choose_trans,
    x = discuss_type
  )

r1_choices_main %>%
  dups_report(ind_id)

r1_choices_main %>%
  count_prop(treat_type_r1)

# Round-1 "silent" choices recorded on the group_role == 1 row of
# choosing-only groups, reshaped to one row per chooser x round and re-keyed to
# the chooser's own ind_id.
r1_choices_silent_choices <- df %>%
  filter(choosing_only == 1, group_role == 1) %>%
  select(all_of(id_vars), group_role,

         matches("group_choice_silent(_backup)?_\\d_\\d"),
         matches("group_choice_silent_id"),
         matches("group_choice_silent_name"), matches("silent_backup_name"),
         matches("^r1_(discuss|no_discuss)_(photo|educ|reliability|english|items|trans)_(\\d)_\\d")) %>%
  # Drop any column whose name contains "[".
  select(-matches("\\[")) %>%

  # rename so that person comes first, then pair
  # (swaps the two trailing digits so the last one is the pair / round)
  rename_with(~ str_replace(.x, "group_choice_silent_(name_|id_)?(\\d)_(\\d)", "group_choice_silent_\\1\\3_\\2")) %>%

  # Make one row per round
  pivot_longer(
    -c(any_of(id_vars), group_role),
    names_to = c(".value", "round"),
    names_pattern = "(.*)_(\\d$)",
    values_transform = as.character
  ) %>%

  # Make one row per person
  pivot_longer(
    matches("group_choice_silent"),
    names_pattern = "(.*)_(\\d$)",
    names_to = c(".value", "person")
  ) %>%
  rename(silent_choice = group_choice_silent,
         silent_choice_name = group_choice_silent_name,
         silent_choice_id = group_choice_silent_id) %>%
  select(-group_choice_silent_backup) %>%
  # Replace the recording row's ind_id with the chooser's id,
  # "<group_id>_<silent_choice_id>".
  select(-ind_id) %>%
  mutate(
    ind_id = paste0(group_id, "_", silent_choice_id)
  ) %>%
  rename_with(~ str_replace(.x, "r1_discuss", "r1")) %>%
  relocate(ind_id) %>%

  # Back to numeric after the character-valued pivots. NOTE(review): the two
  # r1_* ranges are positional and depend on the column order at this point.
  mutate(
    across(r1_educ_1:r1_trans_1, as.numeric),
    across(r1_educ_2:r1_trans_2, as.numeric),
    across(c(person, silent_choice, silent_choice_id, round), as.numeric),
  ) %>%
  select(-matches("r1_no_discuss")) %>%
  arrange(ind_id, round) %>%

  # Round-1 rows of rand_details. NOTE(review): unlike the join in
  # r1_choices_main, listener r1_discuss rows are not removed here -- confirm
  # this cannot duplicate rows.
  left_join(
    rand_details %>% filter(str_detect(round, "r1")) %>% mutate(round = "r1") %>%
      ungroup %>%
      select(-is_listener, -is_speaker),
    by = c("ind_id", "round" = "photo_order_pair_id")
  ) %>%

  mutate(r1_choice = silent_choice) %>%

  mutate(
    # Position (1 or 2) of the photo whose id contains "T"; NA if neither does.
    r1_trans = case_when(
      str_detect(r1_photo_1, "T") ~ 1L,
      str_detect(r1_photo_2, "T") ~ 2L
    ),
    r1_choose_comparator = r1_choice == comparator_order_in_pair,
    r1_choose_trans = r1_choice == r1_trans
  ) %>%
  mutate(group_label = fct_relevel(group_label, "no_discuss"))

# All round-1 choices (main + silent), with comparator-minus-other attribute
# differences, numeric arm dummies, control variables and relation counts.
r1_choices <- bind_rows(
  r1_choices_main,
  r1_choices_silent_choices
) %>%

  # Differences are comparator minus the other candidate.
  mutate(
    educ_diff = ifelse(comparator_order_in_pair == 2, r1_educ_2 - r1_educ_1, r1_educ_1 - r1_educ_2),
    english_diff = ifelse(comparator_order_in_pair == 2, r1_english_2 - r1_english_1, r1_english_1 - r1_english_2)
  ) %>%
  # Missing differences are set to 0; the *_included flags record which rows
  # had a real value.
  mutate(
    educ_included = !is.na(educ_diff),
    english_included = !is.na(english_diff),
    educ_diff = ifelse(educ_included, educ_diff, 0),
    english_diff = ifelse(english_included, english_diff, 0)
  ) %>%
  # Numeric (0/1) recodes, mirroring r2_choices_num.
  mutate(group_label = as.numeric(group_label == "discuss"),
         r1_choose_comparator = as.numeric(r1_choose_comparator),
         pair_includes_trans = as.numeric(pair_includes_trans),
         video_type_placebo = as.numeric(video_type == "placebo"),
         video_type_treatment = as.numeric(video_type == "treatment"),
         discussion_full = as.numeric(discuss_type == "discussion_full"),
         pair_includes_trans_discussion_full = pair_includes_trans * discussion_full,
         control = as.numeric(discuss_type == "control"),
         public = as.numeric(public),
         discussant = as.numeric(discussant)
  ) %>%
  mutate(
    pair_includes_trans_alt = pair_includes_trans
  ) %>%
  tidylog::left_join(control_vars_df, by = "ind_id") %>%
  tidylog::left_join(group_relations_ind, by = c("ind_id", "group_id"))
# NOTE(review): bare expression; it only prints r2_choices when run
# interactively and looks like a debugging leftover.
r2_choices

# Number of times r1 (amalgamate to group level)
# Discussion groups: per group, how often the trans candidate was chosen in
# round 1 (rows with group_role == 1, pairs that include a trans candidate),
# plus mean attribute differences. The group summary is then copied to every
# member listed in r2_choices.
r1_n_choose_trans_group <- r1_choices %>%
  filter(
    group_role == 1,
    discuss_type %in% c("discussion_pair", "discussion_full"),
    as.numeric(pair_includes_trans) == 1
  ) %>%
  group_by(group_id, stratum_id, discuss_type) %>%
  arrange(group_id) %>%
  summarise(
    n_r1_choose_trans_group = sum_na(r1_choose_trans),
    item_diff_mean = mean_na(item_diff),
    quality_diff_mean = mean_na(quality_diff),
    educ_diff_mean = mean_na(educ_diff),
    english_diff_mean = mean_na(english_diff)
  ) %>%
  ungroup() %>%
  # Get at the individual level
  left_join(
    r2_choices %>%
      select(ind_id, group_id) %>%
      distinct(),
    by = "group_id"
  ) %>%
  relocate(group_id, ind_id) %>%
  # NOTE(review): the divisor 2 is hard-coded; presumably the number of
  # trans-including pairs per group -- confirm.
  mutate(p_r1_choose_trans_group = n_r1_choose_trans_group / 2) %>%
  print()

# Control / choosing-only arms: the same summaries per individual, plus the
# group total and the total excluding the individual's own choices.
# The result stays grouped by group_id.
r1_n_choose_trans_ind <- r1_choices %>%
  filter(
    discuss_type %in% c("control", "choosing_only"),
    as.numeric(pair_includes_trans) == 1
  ) %>%
  group_by(
    group_id, ind_id,
    public_non_observer, public_observer,
    stratum_id, discuss_type
  ) %>%
  summarise(
    n_r1_choose_trans_ind = sum_na(r1_choose_trans),
    item_diff_mean = mean_na(item_diff),
    quality_diff_mean = mean_na(quality_diff),
    educ_diff_mean = mean_na(educ_diff),
    english_diff_mean = mean_na(english_diff)
  ) %>%
  ungroup() %>%
  group_by(group_id) %>%
  # NOTE(review): the divisor 4 is hard-coded -- confirm what it counts.
  mutate(
    n_r1_choose_trans_group_all = sum_na(n_r1_choose_trans_ind),
    n_r1_choose_trans_group_excl = n_r1_choose_trans_group_all - n_r1_choose_trans_ind,
    p_r1_choose_trans_group_excl = n_r1_choose_trans_group_excl / 4
  )

# Stack the group-level and individual-level tallies; n_r1_choose_trans takes
# whichever of the two count columns is populated for a row.
r1_n_choose_trans <- r1_n_choose_trans_group %>%
  bind_rows(r1_n_choose_trans_ind) %>%
  mutate(
    n_r1_choose_trans = coalesce(n_r1_choose_trans_group, n_r1_choose_trans_ind)
  )

# No `by` is given, so the join keys are all columns the two tables share.
r2_with_r1 <- tidylog::left_join(r2_choices, r1_n_choose_trans)

# Diagnostics: duplicate report on group/round and a look at one specific group.
r1_choices %>% dups_report(group_id, round)
r1_choices %>%
  filter(group_id == "04F001") %>%
  glimpse()

r1_group_id <- r1_choices %>%
  filter(
    discuss_type %in% c("discussion_full", "discussion_pair"),
    group_role == 1
  ) %>%
  dups_report(group_id, round)

# Discussion observations ---------------------------------------------------------------

# Phase-1 observer records (group_role == 2) for the two discussion arms,
# reshaped to one row per group and round.
discuss_obs_phase1 <- df %>%
  filter(
    phase == "phase_1",
    group_role == 2,
    discuss_type %in% c("discussion_pair", "discussion_full")
  ) %>%
  select(
    all_of(id_vars), group_role,
    matches("group_obs_trans_included"),
    matches("gobs\\d"),
    -matches("\\[")
  ) %>%
  pivot_longer(
    c(
      matches("group_obs_trans_included"),
      matches("gobs[0-6].?.?"),
      matches("gobs[0-6].?.?_label")
    ),
    names_to = c(".value", "round"),
    names_pattern = "(group_obs_trans_included|gobs\\d.*|gobs\\d.?.?_label)_(\\d)"
  ) %>%
  mutate(
    # Code -98 is treated as 0 for the two mention indicators.
    across(c(gobs5, gobs5.2), function(v) ifelse(v == -98, 0, v)),
    group_obs_trans_included = as.logical(group_obs_trans_included),
    # z-score of gobs6, standardised on rounds without a trans candidate.
    gobs6_non_trans_mean = mean_na(gobs6[!group_obs_trans_included]),
    gobs6_non_trans_sd = sd(gobs6[!group_obs_trans_included], na.rm = TRUE),
    gobs6_z = (gobs6 - gobs6_non_trans_mean) / gobs6_non_trans_sd
  ) %>%
  print()

glimpse(discuss_obs_phase1)
# STAYED SAME gobs8, gobs9

# Phase-2 observer records (group_role == 2) for the two discussion arms,
# reshaped to one row per group and round. Pairwise observations record the
# speaker slot (1 or 2); these are translated to the ids stored in
# speaker_1_id / speaker_2_id.
discuss_obs_phase2 <- df %>%
  filter(phase == "phase_2") %>%
  filter(group_role == 2) %>%
  filter(discuss_type %in% c("discussion_pair", "discussion_full")) %>%
  select(
    all_of(id_vars), group_role, speaker_1_id, speaker_2_id,
    matches("group_obs_trans_included"),
    matches("gobs(8|9)"),
    matches("gobsv2_\\d"),
    -matches("\\[")
  ) %>%
  pivot_longer(
    c(
      matches("group_obs_trans_included"),
      matches("gobsv2_\\d(.*_label)?")
    ),
    names_pattern = "(group_obs_trans_included|gobsv2_\\d|gobsv2_\\d_full|gobsv2_\\d_pair|gobsv2_6_other|gobsv2_\\d.*_label)_(\\d)",
    names_to = c(".value", "round")
  ) %>%
  mutate(group_obs_trans_included = as.logical(group_obs_trans_included)) %>%

  # Add person ID onto the pairwise speaker observations, instead of the speaker ID.
  # The slots are first swapped for letter placeholders and only then for the
  # ids. Replacing "1" and "2" directly in sequence corrupts the result whenever
  # speaker_1_id itself contains "2": with speakers 2 and 3, slot "1" would
  # become "2" and then immediately "3".
  mutate(
    across(
      c(gobsv2_1_pair, gobsv2_3_pair),
      ~ .x %>% as.character() %>%
        str_replace("1", "A") %>%
        str_replace("2", "B") %>%
        str_replace("A", as.character(speaker_1_id)) %>%
        str_replace("B", as.character(speaker_2_id))
    )
  ) %>%
  mutate(gobsv2_1_pair = as.numeric(gobsv2_1_pair)) %>%
  mutate(
    # Dominance in pair discussions: values 1-2 are attributed to speaker 1,
    # 4 and above to speaker 2, 3 to nobody (0); anything else (including
    # missing) is passed through as recorded.
    gobsv2_4_pair_who = case_when(
      gobsv2_4_pair <= 2 ~ speaker_1_id,
      gobsv2_4_pair >= 4 ~ speaker_2_id,
      gobsv2_4_pair == 3 ~ 0,
      TRUE ~ gobsv2_4_pair
    )
  )

# Element-wise coalesce for two list-columns of equal length.
# For each position: keep the element of `x` unless it is a single NA; in that
# case take the element of `y`, or NA when that is a single NA as well.
coalesce_lists <- function(x, y) {
  is_single_na <- function(el) length(el) == 1 && is.na(el)

  map2(x, y, function(primary, fallback) {
    if (!is_single_na(primary)) {
      return(primary)
    }
    if (is_single_na(fallback)) NA else fallback
  })
}

# Harmonisation table for discussion reasons: keep only rows that have a
# phase-1 code, with the phase-1 columns moved to the front.
harmonise_reasons <- read_csv("data/raw/harmonise_reasons.csv") %>%
  relocate(reason_val_phase_1, reason_text_phase_1) %>%
  filter(!is.na(reason_val_phase_1))

# Lookup vectors for recoding: names are the phase-1 entries, values the
# corresponding phase-2 entries.
harmonise_recode_vals <- set_names(
  harmonise_reasons$reason_val_phase_2,
  harmonise_reasons$reason_val_phase_1
)

harmonise_recode_text <- set_names(
  harmonise_reasons$reason_text_phase_2,
  harmonise_reasons$reason_text_phase_1
)

# HARMONISE across both phases
# Stacks the phase-1 and phase-2 observer records and derives a common set of
# variables from the phase-specific columns: who spoke first, who dominated,
# the reasons mentioned, the amount discussed (raw and z-scored), positive and
# negative mentions, and who spoke in favour.
discuss_obs <- bind_rows(
  discuss_obs_phase1,
  discuss_obs_phase2
) %>%

  # First non-missing value across the phase-1, phase-2 full and phase-2 pair columns.
  mutate(
    spoke_first = coalesce(
      gobs1, gobsv2_1_full, gobsv2_1_pair
    ),
    dominant = coalesce(
      gobs2, gobsv2_4_full, as.character(gobsv2_4_pair_who)
    ),
  ) %>%
  # NOTE(review): count_prop() sits in the middle of the pipeline, so it
  # presumably reports the tabulation and passes the data through unchanged —
  # confirm against its definition.
  count_prop(phase, discuss_type, dominant) %>%

  # Harmonise the reasons:
  # gobs3 codes and labels are recoded through the lookups built from the
  # harmonisation table (names = phase-1 entries, values = phase-2 entries).
  mutate(gobs3_list = split_multiple(gobs3)) %>%
  mutate(gobs3_list_recode = map(gobs3_list, ~dplyr::recode(.x, !!!harmonise_recode_vals))) %>%

  mutate(gobs3_label_list = gobs3_label %>% str_split(";")) %>%
  mutate(gobs3_label_list_recode = map(gobs3_label_list, ~dplyr::recode(.x, !!!harmonise_recode_text))) %>%

  mutate(gobsv2_2_list = split_multiple(gobsv2_2)) %>%
  mutate(gobsv2_2_label_list = gobsv2_2_label %>% str_split(";")) %>%

  # Combine the two reasons lists
  mutate(reasons_val_list = coalesce_lists(gobs3_list_recode, gobsv2_2_list)) %>%
  mutate(reasons_text_list = coalesce_lists(gobs3_label_list_recode, gobsv2_2_label_list)) %>%

  mutate(amount_discussed = coalesce(gobs6, gobsv2_5)) %>%

  # CREATE Z-score for amount_discussed
  # Reference mean and SD come from rounds without a trans candidate in full discussions.
  mutate(
    amount_discussed_non_trans_mean = mean_na(amount_discussed[!group_obs_trans_included & discuss_type == "discussion_full"]),
    amount_discussed_non_trans_sd = sd(amount_discussed[!group_obs_trans_included & discuss_type == "discussion_full"], na.rm = TRUE),
  ) %>%
  mutate(
    amount_discussed_z = (amount_discussed - amount_discussed_non_trans_mean) / amount_discussed_non_trans_sd
  ) %>%

  # Infer negative mentions from reasons for first part of phase 2
  # Applies only to phase-2 trans rounds where gobs5.2 is missing: TRUE when
  # any of the reason codes 22-27 was recorded, NA for all other rows.
  mutate(neg_mentions_inferred = ifelse(
    phase == "phase_2" & group_obs_trans_included & is.na(gobs5.2),
    map_lgl(reasons_val_list, ~sum(.x %in% c(22:27))>0),
    NA
  )) %>%

  # Infer pos mentions
  mutate(gobsv2_3 = coalesce(gobsv2_3_full, gobsv2_3_pair)) %>% # combine full and pair in phase 2
  mutate(gobsv2_3_list = gobsv2_3 %>% str_split(" ")) %>%

  mutate(pos_mentions_inferred = ifelse(
    phase == "phase_2" & group_obs_trans_included,
    map_lgl(gobsv2_3_list, ~sum(.x %in% 1:3) > 0), # at least 1 person mentions positive
    NA
  )) %>%

  # Directly recorded indicators take precedence over the inferred ones.
  mutate(
    pos_mentions = coalesce(as.logical(gobs5), pos_mentions_inferred),
    neg_mentions = coalesce(as.logical(gobs5.2), neg_mentions_inferred)
  ) %>%

  mutate(
    who_spoke_pro_trans = coalesce(gobsv2_3, gobs5.1)
  ) %>%

  # Impute missing values from when gobs5.1 was missing:
  # gobs5 == 0 is filled with "0" and gobs5 == 1 with "1 2 3".
  tidylog::mutate(who_spoke_pro_trans = case_when(
    gobs5 %in% 0 & is.na(who_spoke_pro_trans) ~ "0",
    gobs5 %in% 1 & is.na(who_spoke_pro_trans) ~ "1 2 3",
    TRUE                                      ~ who_spoke_pro_trans
  )) %>% 
  # Count the space-separated entries; "0" counts as zero people, while a
  # missing value or "-98" gives NA. The share divides the count by 3.
  # NOTE(review): the if_else() calls mix a double 0 and NA_integer_ with an
  # integer count; releases of dplyr with strict if_else() typing may reject
  # this — confirm the dplyr version in use.
  mutate(n_who_spoke_pro_trans = who_spoke_pro_trans %>% str_split(" ") %>% map_int(length),
  n_who_spoke_pro_trans = if_else(who_spoke_pro_trans == "0", 0, n_who_spoke_pro_trans),
  n_who_spoke_pro_trans = if_else(is.na(who_spoke_pro_trans) | who_spoke_pro_trans == "-98", NA_integer_, n_who_spoke_pro_trans),
  p_spoke_pro_trans = n_who_spoke_pro_trans / 3) %>% 

  # gobs9: how pro did they seem in discussion?
  # Codes 1, 2, 3, 4 become -2, -1, 1, 2; codes 50 and 0 become 0.
  mutate(across(c(gobs9_1, gobs9_2, gobs9_3),
                ~ dplyr::recode(.x, `1` = -2, `2` = -1, `3` = 1, `4` = 2, `50` = 0, `0` = 0))) %>%
  mutate(discussion_full = as.numeric(discuss_type == "discussion_full"))

# Collapse the per-round observations to one row per combination of the id
# variables, gobs8 and discussion_full, averaging over rounds.
discuss_obs_per_group <- discuss_obs %>%
  group_by(across(all_of(c(id_vars, "gobs8", "discussion_full")))) %>%
  summarise(
    p_pos_mentions = mean_na(pos_mentions),
    p_neg_mentions = mean_na(neg_mentions),
    amount_discussed = mean_na(amount_discussed)
  ) %>%
  ungroup()

# 1. Get the list of most common narratives
# 2. Create an indicator variable for whether a narrative was mentioned at all for the trans pair
# 3. Use it as a heterogeneity variable in the regression

# All narratives: one row per (trans-included flag, narrative), with the number
# and share of group-rounds in which the narrative was recorded.
narrative_list <- discuss_obs %>%
  select(group_id, round, group_obs_trans_included, reasons_text_list, reasons_val_list) %>%
  unnest(c(reasons_text_list, reasons_val_list)) %>%
  mutate(present = TRUE) %>%
  # Add the group-round x narrative combinations that were not observed ...
  complete(
    nesting(group_id, round, group_obs_trans_included),
    nesting(reasons_text_list, reasons_val_list)
  ) %>%
  # ... and mark them as not present.
  mutate(present = coalesce(present, FALSE)) %>%
  group_by(group_obs_trans_included, reasons_text_list, reasons_val_list) %>%
  summarise(
    n = sum_na(present),
    p = mean_na(present)
  )

# Keep the 10 most common narratives among rounds that include a trans candidate
common_narratives <- narrative_list %>%
  filter(group_obs_trans_included == TRUE) %>%
  arrange(desc(p)) %>%
  ungroup() %>%
  head(10) %>%
  print()

# Column names of the per-narrative indicators, leaving out code -97
narrative_vars <- paste0(
  "reasons_val_list_",
  pull(common_narratives, reasons_val_list)
)
narrative_vars <- subset(narrative_vars, narrative_vars != "reasons_val_list_-97")

# Round-1 choices with the observer record of the same group and round attached.
r1_w_discuss_obs <- tidylog::left_join(
  r1_choices,
  mutate(discuss_obs, round = as.numeric(round)),
  by = c("group_id", "round"),
  suffix = c("", "_discuss_obs_extra")
)

# Group-level narrative indicators, for full discussions and trans rounds only:
#   *_p        share of rows in which the narrative was mentioned
#   *_p_choose share of rows in which it was mentioned and the trans person chosen
narrative_indicators <- r1_w_discuss_obs %>%
  filter(
    discuss_type %in% c("discussion_full"),
    group_obs_trans_included
  ) %>%
  expand_select_multiple(reasons_val_list) %>%
  group_by(group_id) %>%
  summarise(
    across(
      all_of(narrative_vars),
      list(
        p = ~ mean_na(.x),
        p_choose = ~ mean_na(.x * r1_choose_trans)
      )
    )
  )

r2_with_narrative_indicators <- left_join(
  r2_choices,
  narrative_indicators,
  by = "group_id"
)

# One row per group observation, with a z-score for the overall amount
# discussed (gobs8) standardised on the full-discussion groups.
discuss_obs_per_group_phase2 <- discuss_obs_per_group %>%
  mutate(
    gobs8_mean = mean_na(gobs8[discuss_type == "discussion_full"]),
    gobs8_sd = sd(gobs8[discuss_type == "discussion_full"], na.rm = TRUE)
  ) %>%
  mutate(gobs8_z = (gobs8 - gobs8_mean) / gobs8_sd)
# Choosing only observations ---------------------------------------------------------------

# Observer records (group_role == 2) for the "choosing_only" arm.
discuss_obs_choosing_only <- df %>%
  filter(discuss_type == "choosing_only", group_role == 2) %>%
  select(
    group_id, group_role, discuss_type,
    matches("gobs_choosing"),
    -matches("\\[")
  ) %>%
  mutate(
    participants_saw = gobs_choosing0 / 4,
    participants_spoke = gobs_choosing1,
    participants_spoke_what = gobs_choosing2_label,
    participants_spoke_what_other = gobs_choosing2_other,
    # TRUE when the recorded topic contains "A / B" or the free-text answer
    # matches one of the listed responses; missing counts as FALSE.
    participants_spoke_about_option = coalesce(
      str_detect(participants_spoke_what, "A / B") |
        str_detect(participants_spoke_what_other, vars_to_regex(c(
          "3 vathi jodila oru person pathu enga uru paiyanu sonnanga",
          "Evanga yenna Transgender ahh nu santhama kettuttanga",
          "Person pakka bayama iruku"
        ))),
      FALSE
    )
  )

# Ind-level discuss_obs ---------------------------------------------------------------

# Get discuss_obs to the person level
# Each observer record is paired with the members of its group, and for each
# person the share of rounds in which they spoke first / were dominant is
# computed (overall, and separately for trans and non-trans rounds). The shares
# are then standardised and combined into dominance scores.
discuss_obs_person <- discuss_obs %>%
  arrange(group_id) %>%
  select(group_id, group_obs_trans_included, phase, group_label, discuss_type, round, spoke_first, dominant) %>%
  # `dominant` may hold several space-separated ids.
  mutate(dominant = str_split(dominant, " ")) %>%
  # No `by` given: joins on the shared column(s) of the two selections (group_id).
  left_join(df %>% select(group_id, ind_id)) %>%
  # Within-group person number: ind_id with the group_id and "_" stripped.
  mutate(ind_id_in_group = ind_id %>% str_replace(group_id, "") %>% str_replace("_", "") %>% as.numeric()) %>%
  mutate(
    spoke_first = ind_id_in_group == spoke_first,
    dominant = map2_lgl(ind_id_in_group, dominant, ~ as.character(.x) %in% .y)
  ) %>%
  group_by(group_id, ind_id, ind_id_in_group, phase, group_label, discuss_type) %>%
  summarise(
    p_spoke_first = mean_na(spoke_first),
    p_dominant = mean_na(dominant),
    p_spoke_first_trans = mean_na(spoke_first[group_obs_trans_included == TRUE]),
    p_dominant_trans = mean_na(dominant[group_obs_trans_included == TRUE]),
    p_spoke_first_nontrans = mean_na(spoke_first[group_obs_trans_included == FALSE]),
    p_dominant_nontrans = mean_na(dominant[group_obs_trans_included == FALSE]),
  ) %>%

  ungroup %>%
  # Adds a *_z column for each share (z_calc_std presumably standardises to
  # mean 0 / SD 1 — confirm against its definition).
  mutate(
    across(
      c(p_spoke_first, p_dominant, p_spoke_first_trans, p_dominant_trans, p_spoke_first_nontrans, p_dominant_nontrans),
      list(z = z_calc_std)
    )
  ) %>%
  # Dominance score = sum of the two z-scores.
  mutate(
    overall_dominance = p_spoke_first_z + p_dominant_z,
    overall_dominance_trans = p_spoke_first_trans_z + p_dominant_trans_z,
    overall_dominance_nontrans = p_spoke_first_nontrans_z + p_dominant_nontrans_z
  ) %>%

  # Leave-one-out scores: each row gets its group's score vector with the
  # element at position ind_id_in_group removed.
  # NOTE(review): this assumes rows within a group are ordered so that row
  # position equals ind_id_in_group — confirm.
  group_by(group_id) %>%
  mutate(
    overall_dominance_others = list(overall_dominance),
    overall_dominance_others = map2(overall_dominance_others, ind_id_in_group, ~ .x[-.y]),
    overall_dominance_trans_others = list(overall_dominance_trans),
    overall_dominance_trans_others = map2(overall_dominance_trans_others, ind_id_in_group, ~ .x[-.y]),
  ) %>%
  ungroup %>%
  mutate(
    overall_dominance_others_mean = map_dbl(overall_dominance_others, mean_na),
    overall_dominance_trans_others_mean = map_dbl(overall_dominance_trans_others, mean_na)
  )


# Round-2 choices with each person's discussion-dominance measures attached.
r2_with_discuss_obs <- left_join(
  r2_choices_num,
  select(discuss_obs_person, -group_label, -phase, -discuss_type),
  by = c("ind_id", "group_id")
)
# Listener observations ---------------------------------------------------------------

# Observer records (group_role == 2) for pair discussions, flagging whether the
# listener spoke about the options: gobs_pair4 contains 1 or 3, or the
# free-text answer matches one of the listed responses. Missing counts as FALSE.
listener_obs <- df %>%
  filter(discuss_type == "discussion_pair", group_role == 2) %>%
  mutate(
    listener_spoke_about_options = coalesce(
      str_detect(gobs_pair4, "1|3") |
        str_detect(
          gobs_pair4_other,
          vars_to_regex(c(
            "Comments pannanga",
            "Delivery athigam nu sollittanga"
          ))
        ),
      FALSE
    )
  )


# Person-wise discussion observations ---------------------------------------------------------------

# General attitude shown in the discussion (gobs9), one row per group member,
# taken from the observer records of the two discussion arms.
gen_att_in_discussion <- df %>%
  # Codes 1, 2, 3, 4 become -2, -1, 1, 2; codes 50 and 0 become 0.
  mutate(across(
    c(gobs9_1, gobs9_2, gobs9_3),
    function(score) {
      dplyr::recode(score, `1` = -2, `2` = -1, `3` = 1, `4` = 2, `50` = 0, `0` = 0)
    }
  )) %>%
  select(
    group_id, discuss_type, listener_id,
    matches("group_person_att_i"), matches("gobs9"), matches("group_role")
  ) %>%
  filter(
    group_role == 2,
    discuss_type %in% c("discussion_full", "discussion_pair")
  ) %>%
  pivot_longer(
    -c(group_id, group_role, discuss_type, listener_id),
    names_to = c(".value", "person"),
    names_pattern = "(.*)_(\\d)$"
  ) %>%
  mutate(ind_id = str_glue("{group_id}_{person}")) %>%
  # z-score within discussion type
  group_by(discuss_type) %>%
  mutate(gobs9_z = z_calc_std(gobs9)) %>%
  # group-level averages of the z-score
  group_by(group_id) %>%
  mutate(
    gobs9_z_group = mean_na(gobs9_z),
    gobs9_z_group_excl = mean_exclude(gobs9_z)
  ) %>%
  ungroup()

# ind_id is expected to be unique here: report duplicates, then enforce.
dups_report(gen_att_in_discussion, ind_id)
dups_stop(gen_att_in_discussion, ind_id)

# in_list
# Person-level share of rounds in which each group member (persons 1-3) was
# recorded as speaking in favour, with z-scores and group-level averages.
spoke_in_favour_discuss <- discuss_obs %>%
  select(group_id, round, discuss_type, listener_id, who_spoke_pro_trans) %>%
  # One row per observation and person slot.
  tidyr::crossing(person = 1:3) %>%

  mutate(who_spoke_pro_trans_split = split_multiple(who_spoke_pro_trans)) %>%
  # NA when nothing was recorded; otherwise whether this person is among the listed ids.
  mutate(spoke_pro_trans = ifelse(is.na(who_spoke_pro_trans), NA, in_list(person, who_spoke_pro_trans_split))) %>%
  # Rows where the person is the recorded listener are set to NA.
  tidylog::mutate(spoke_pro_trans = ifelse(!is.na(listener_id) & person == listener_id, NA, spoke_pro_trans)) %>%

  # Average over rounds for each person.
  group_by(group_id, person, discuss_type) %>%
  summarise(spoke_pro_trans = mean_na(spoke_pro_trans)) %>%

  # Calculate z-score
  group_by(discuss_type) %>%
  mutate(spoke_pro_trans_z = z_calc_std(spoke_pro_trans)) %>%

  # Group-level means; the *_excl variants use mean_exclude() (presumably a
  # leave-one-out mean that omits the person's own value — confirm).
  group_by(group_id) %>%
  mutate(

    spoke_pro_trans_group = mean_na(spoke_pro_trans),
    spoke_pro_trans_group_excl = mean_exclude(spoke_pro_trans),
    spoke_pro_trans_z_group = mean_na(spoke_pro_trans_z),
    spoke_pro_trans_z_group_excl = mean_exclude(spoke_pro_trans_z)
  ) %>%
  # Rebuild the individual id as "<group_id>_<person>".
  mutate(ind_id = str_glue("{group_id}_{person}")) %>%
  ungroup

# Group-level average amount discussed in round 1, split by whether the round
# included a trans candidate, reshaped to one row per group with *_trans and
# *_non_trans columns.
r1_amount_discussed <- discuss_obs %>%
  select(
    group_id, round, group_obs_trans_included,
    amount_discussed, amount_discussed_z
  ) %>%
  group_by(group_id, group_obs_trans_included) %>%
  summarise(
    amount_discussed_z = mean_na(amount_discussed_z),
    amount_discussed = mean_na(amount_discussed)
  ) %>%
  mutate(
    trans_label = ifelse(group_obs_trans_included, "trans", "non_trans")
  ) %>%
  select(-group_obs_trans_included) %>%
  pivot_wider(
    names_from = trans_label,
    values_from = c(amount_discussed, amount_discussed_z)
  ) %>%
  print()

# Listeners' round-2 choices with the group averages attached; the two z
# columns are standardised again within this sample.
r2_listener_w_amount_discussed <- r2_choices_num %>%
  filter(is_listener == 1) %>%
  tidylog::left_join(r1_amount_discussed, by = "group_id") %>%
  mutate(
    amount_discussed_z_trans = z_calc_std(amount_discussed_z_trans),
    amount_discussed_z_non_trans = z_calc_std(amount_discussed_z_non_trans)
  )

# ind_id is expected to be unique here: report duplicates, then enforce.
spoke_in_favour_discuss %>% dups_report(ind_id)
spoke_in_favour_discuss %>% dups_stop(ind_id)

# Round-2 choices combined with the discussion heterogeneity measures:
# attitude shown (gobs9), speaking in favour, share of positive/negative
# mentions, round-1 trans choices and amount discussed.
r2_w_discuss_het <- r2_choices %>%
  tidylog::left_join(gen_att_in_discussion, by = "ind_id", suffix = c("", "_extra")) %>%
  tidylog::left_join(spoke_in_favour_discuss, by = "ind_id", suffix = c("", "_extra")) %>%
  tidylog::left_join(
    select(discuss_obs_per_group, group_id, p_pos_mentions, p_neg_mentions),
    by = "group_id"
  ) %>%
  tidylog::left_join(r1_n_choose_trans, by = "ind_id", suffix = c("", "_extra")) %>%
  tidylog::left_join(r1_amount_discussed, by = "group_id", suffix = c("", "_extra")) %>%
  mutate(
    # Average of the two z-scores: the person's own, and the group measures
    # that leave the person out.
    favourable_discussion_z = (spoke_pro_trans_z + gobs9_z) / 2,
    favourable_discussion_z_group_excl =
      (spoke_pro_trans_z_group_excl + gobs9_z_group_excl) / 2
  )

# Missing-value counts for the constructed measures.
r2_w_discuss_het %>%
  select(
    spoke_pro_trans, gobs9_z, gobs9_z_group_excl,
    favourable_discussion_z, favourable_discussion_z_group_excl
  ) %>%
  count_nas()

# Reasons for choices in discussion ---------------------------------------------------------------

# Collapsed reason categories; reason_text has emojis removed and a leading
# "__" stripped.
harmonise_reasons_collapse <- read_csv("data/raw/harmonise_reasons_collapse.csv") %>%
  mutate(reason_text = str_replace(remove_emojis(reason_text), "^__", ""))

print(discuss_obs$reasons_text_list[[1]])

# Long table of reason indicators: one row per observation and reason code,
# labelled with the phase-2 reason text.
reasons_discussion <- discuss_obs %>%
  expand_select_multiple(reasons_val_list) %>%
  glimpse() %>%
  pivot_longer(
    matches("reasons_val_list_"),
    names_prefix = "reasons_val_list_",
    names_to = "reason_val",
    values_to = "reason_selected"
  ) %>%
  # Column suffixes use "_" where the code has a minus sign.
  mutate(reason_val = as.numeric(str_replace(reason_val, "_", "-"))) %>%
  left_join(
    distinct(select(
      read_csv("data/raw/harmonise_reasons.csv"),
      reason_val_phase_2, reason_text_phase_2
    )),
    by = c("reason_val" = "reason_val_phase_2")
  ) %>%
  rename(reason_text = reason_text_phase_2) %>%
  mutate(reason_text = remove_emojis(reason_text))

# Aggregate to the group level: was each reason recorded at least once in the
# group's round-1 discussion?
reasons_discussion_group <- reasons_discussion %>%
  glimpse() %>%
  group_by(group_id, reason_val, reason_text) %>%
  summarise(reason_heard_r1 = sum_na(reason_selected) > 0)

# Same indicator, split by whether the round included a trans candidate and
# spread into reason_heard_r1_trans / reason_heard_r1_non_trans.
reasons_discussion_group_by_trans <- reasons_discussion %>%
  glimpse() %>%
  group_by(group_id, reason_val, reason_text, group_obs_trans_included) %>%
  summarise(reason_heard_r1_by_gender = sum_na(reason_selected) > 0) %>%
  mutate(trans = ifelse(group_obs_trans_included, "trans", "non_trans")) %>%
  select(-group_obs_trans_included) %>%
  pivot_wider(
    names_from = trans,
    names_prefix = "reason_heard_r1_",
    values_from = reason_heard_r1_by_gender
  ) %>%
  print()

# Reasons for choices [detail / pairwise] ---------------------------------------------------------------

# Columns holding the per-round "reasons" pair details, choices and stated reasons (f2).
regex_reasons <- "reasons_.*_\\d_\\d$|reasons_hiring_choice_\\d$|f2_\\d$|f2_other_\\d$"

# Phase-2 stated reasons, in long form: one row per person, round and reason
# code, with an indicator for whether that reason was selected.
reasons_for_choices <- df %>%
  filter(phase == "phase_2") %>%
  select(all_of(id_vars), matches(regex_reasons)) %>%
  # The trailing digit of each column name becomes `round`.
  pivot_longer(
    matches(regex_reasons),
    names_pattern = "(.*)_(\\d)$",
    names_to = c(".value", "round")
  ) %>%
  # A photo id starting with "T" marks the trans candidate.
  mutate(pair_includes_trans = str_detect(reasons_photo_1, "^T") | str_detect(reasons_photo_2, "^T")) %>%
  mutate(
    which_photo_is_trans = case_when(
      str_detect(reasons_photo_1, "^T") ~ 1L,
      str_detect(reasons_photo_2, "^T") ~ 2L,
      TRUE ~ NA_integer_
    )
  ) %>%
  mutate(chose_trans = reasons_hiring_choice == which_photo_is_trans) %>%
  # f2 is a space-separated multi-select.
  mutate(
    f2_list = str_split(f2, " ")
  ) %>%
  expand_select_multiple(f2_list) %>%
  pivot_longer(
    matches("f2_list_(.*)"),
    names_pattern = "f2_list_(.*)",
    names_to = "f2_val",
    values_to = "reason_selected"
  ) %>%
  # Column suffixes use "_" where the code has a minus sign.
  mutate(f2_val = f2_val %>% str_replace("_", "-") %>% as.numeric()) %>%
  # Attach the phase-2 reason text for each code.
  left_join(
    read_csv("data/raw/harmonise_reasons.csv") %>% select(reason_val_phase_2, reason_text_phase_2) %>% distinct(),
    by = c("f2_val" = "reason_val_phase_2")
  ) %>%
  rename(reason_val = f2_val,
         reason_text = reason_text_phase_2) %>%
  mutate(reason_text = remove_emojis(reason_text)) %>%
  mutate(
    pair_includes_trans_label = ifelse(pair_includes_trans, "Worker is trans", "Worker is non-trans")
  ) %>%
  # Add the collapsed reason categories, matched on the cleaned reason text.
  tidylog::left_join(harmonise_reasons_collapse,
                     by = "reason_text")

# combine R1 and R2 reasons ---------------------------------------------------------------
# Stated round-2 reasons, with indicators for whether the same reason was heard
# in the group's round-1 discussion (overall, and split by trans / non-trans rounds).
combined_reasons <- reasons_for_choices %>%
  left_join(
    reasons_discussion_group,
    by = c("group_id", "reason_val", "reason_text")
  ) %>%
  left_join(
    reasons_discussion_group_by_trans,
    by = c("group_id", "reason_val", "reason_text")
  )

# Distinct groups that have discussion reasons recorded.
distinct(select(ungroup(reasons_discussion_group), group_id))

# Salience ---------------------------------------------------------------
# Phase-1 salience answers collapsed to one row per person: whether the trans
# item was remembered and how many other items were listed.
salience <- df %>%
  filter(phase == "phase_1") %>%
  select(all_of(id_vars), ind_id, arm_label, group_label, phase, group_id, group_video, group, matches("video_type"), stratum_id, matches("salience_\\d$")) %>%
  pivot_longer(matches("salience_\\d")) %>%
  # Each answer is a space-separated list; "0" counts as zero items and "-99" is flagged as missing.
  mutate(salience_vals = str_split(value, " "),
         n_salience = map_int(salience_vals, length)) %>%
  mutate(n_salience = ifelse(value == "0", 0, n_salience)) %>%
  mutate(missing = value == "-99") %>% # missing salience

  # Groups by every column from phase through video_type_treatment, in the
  # column order produced by the select() above.
  group_by(across(phase:video_type_treatment)) %>%
  # trans_remembered: value "6" listed under salience_2. The item count removes
  # that one item when it was remembered; the proportion divides by 17.
  summarise(trans_remembered = any(name == "salience_2" & map_lgl(salience_vals, ~ "6" %in% .x)),
            n_salience_minus_trans = sum(n_salience) - as.numeric(trans_remembered),
            prop_salience_minus_trans = n_salience_minus_trans / 17,
            missing = any(missing)) %>%
  ungroup %>%
  # Two-step imputation: first blank out the values of people with a missing
  # answer, then fill them with median_na() of the remaining values.
  tidylog::mutate(
    across(c(trans_remembered, n_salience_minus_trans, prop_salience_minus_trans), ~ ifelse(missing, NA, .x))
  ) %>%
  tidylog::mutate(
    across(c(trans_remembered, n_salience_minus_trans, prop_salience_minus_trans), ~ ifelse(missing, median_na(.x), .x))
  ) %>%
  tidylog::left_join(control_vars_df, by = "ind_id") %>%
  mutate(
    across(c(video_type_placebo, video_type_treatment, video_type_control),
           as.numeric)
  )

# Duplicate report on ind_id.
salience %>% dups_report(ind_id)

# Round-2 choices with the salience measures, plus an at-or-above-median indicator.
r2_with_salience <- r2_choices %>%
  left_join(salience %>% select(ind_id, trans_remembered, prop_salience_minus_trans), by = "ind_id") %>%
  mutate(
    above_median_prop_salience = as.numeric(prop_salience_minus_trans >= median_na(prop_salience_minus_trans)),
    trans_remembered = as.numeric(trans_remembered)
  )

# Perceived purpose ---------------------------------------------------------------

# Perceived-purpose answers split into lists, with indicators for whether
# option "8" was selected in purpose_0 and purpose_2.
purpose <- df %>%
  mutate(
    purpose_0_list = str_split(purpose_0, " "),
    purpose_2_list = str_split(purpose_2, " "),
    purpose_0_label_list = str_split(purpose_0_label, ";"),
    purpose_2_label_list = str_split(purpose_2_label, ";"),
    purpose_0_trans = map_lgl(purpose_0_list, function(vals) "8" %in% vals),
    purpose_2_trans = map_lgl(purpose_2_list, function(vals) "8" %in% vals),
    arm_label = fct_relevel(arm_label, "control+no_discuss")
  )

# No `by` given: the join uses the columns shared with the selection below.
r2_with_purpose <- r2_choices %>%
  left_join(select(purpose, ind_id, purpose_0_trans, purpose_2_trans)) %>%
  mutate(
    purpose_0_trans = as.numeric(purpose_0_trans),
    purpose_2_trans = as.numeric(purpose_2_trans)
  )
# SDB ---------------------------------------------------------------
# Item key for the SDB questions, read from the "sdb" sheet.
sdb_answers <- read_excel("data/raw/other_data_inputs.xlsx", "sdb")

# SDB item responses per person, shifted down by 1 (consistent with the 0/1
# range passed to prep_df() below).
sdb_rev <- df %>%
  select(KEY, ind_id, matches("sdb_\\d\\d?$")) %>%
  select(-matches("\\[")) %>%
  mutate(across(-c(KEY, ind_id), ~.x-1))

# Long-format items; the items flagged desirable_answer == 1 in the key are
# passed as rev_vec (presumably the items to reverse-code — confirm against prep_df()).
sdb_long <- prep_df(sdb_rev,
                    var_regex = "sdb_\\d\\d?$",
                    rev_vec = sdb_answers %>% filter(desirable_answer==1) %>% pull(name),
                    max_val = 1,
                    min_val = 0,
                    na_vals = -100
)
# Loadings from factor_loadings(1); the MR1 column is renamed to `loading`.
sdb_loadings <- sdb_long %>% make_wide(val_rev) %>% factor_loadings(1) %>%
  rename(loading = MR1)

# Write the loadings, ordered by item number and rounded to 2 decimals, as a
# parenthesised comma-separated list.
sdb_loadings %>%
  mutate(var_i = str_replace_all(var, "sdb_|_REV", "") %>% as.integer) %>%
  arrange(var_i) %>%
  pull(loading) %>%
  round(2) %>%
  paste(collapse = ", ") %>%
  paste0("(", ., ")") %>%
  writeLines("outputs/stats/sdb_loadings.tex")

# Inverse cov
# Wide item matrix with unit weights, passed to add_inverse_cov_index() to add
# the sdb_inv_cov index column.
sdb_inv_cov <- sdb_long %>% select(KEY, name, val_rev) %>%
  pivot_wider(names_from = name, values_from = val_rev) %>%
  mutate(weight = 1) %>%
  add_inverse_cov_index(-KEY, var = "sdb_inv_cov", weight_var = weight)

# Per-person SDB scores: plain sum, acquiescence-based sum, loading-weighted
# sum (loadings below 0.3 set to 0) and the inverse-covariance index.
sdb_score <- sdb_long %>%
  left_join(sdb_loadings, by = c("name" = "var")) %>%
  mutate(loading = ifelse(loading >= 0.3, loading, 0)) %>%
  group_by(KEY) %>%
  summarise(
    sdb_score = sum_na(val_rev),
    sdb_score_acqui = sum_na(val_acqui),
    sdb_score_fact = sum_na(val_rev * loading)
  ) %>%
  left_join(select(sdb_inv_cov, KEY, sdb_inv_cov))

df_with_sdb <- left_join(df, select(sdb_score, KEY, sdb_score))

# Round-2 choices with the scores and at-or-above-median indicators (as 0/1).
r2_with_sdb <- r2_choices %>%
  left_join(sdb_score, by = "KEY") %>%
  mutate(
    high_sdb = sdb_score >= median(sdb_score, na.rm = TRUE),
    high_sdb_acqui = sdb_score_acqui >= median(sdb_score_acqui, na.rm = TRUE),
    high_sdb_fact = sdb_score_fact >= median(sdb_score_fact, na.rm = TRUE),
    high_sdb_inv_cov = sdb_inv_cov >= median(sdb_inv_cov, na.rm = TRUE)
  ) %>%
  mutate(across(matches("high_sdb"), as.numeric))

# Item values ---------------------------------------------------------------
# Median daily household expenditure: b14 / b13 divided by the average number
# of days per month (365.25 / 12). Presumably b14 is monthly expenditure and
# b13 household size — confirm against the questionnaire.
# Missing values are dropped before taking the median, as elsewhere in this
# script; without na.rm a single missing b13/b14 would turn the median, and
# every relative item value derived from it, into NA.
median_hh_exp <- df %>%
  mutate(hh_exp_daily = (b14 / b13) / (365.25 / 12)) %>%
  summarise(hh_exp_daily_med = median(hh_exp_daily, na.rm = TRUE))

# Write the rounded median for use in the paper.
median_hh_exp %>%
  round(0) %>%
  as.character() %>%
  writeLines("outputs/stats/hh_exp_daily_med.tex")

# Median monthly food expenditure, divided by the PPP conversion factor
# (printed to the console only).
oecd_ppp_factor <- 22.882

median(pull(df, hh_food_exp_pc), na.rm = TRUE) / oecd_ppp_factor

# Item cost
# NOTE(review): the objects are named "medians" but the perceived values
# (f5.1-f5.3) are aggregated with mean_na() — confirm which is intended.
item_medians <- df %>% summarise(across(c(f5.1, f5.2, f5.3), mean_na))

# Cumulative value of item sets of size 1, 2 and 3: perceived values (from
# f5.*) and fixed values (68, 86, 86), each also expressed relative to the
# median daily household expenditure.
item_set_medians <- tibble(
  n_items = c(1, 2, 3),
  items_value_perc = cumsum(c(item_medians$f5.1, item_medians$f5.2, item_medians$f5.3)),
  items_value = cumsum(c(68, 86, 86))
) %>%
  mutate(
    items_value_perc_rel = items_value_perc / median_hh_exp$hh_exp_daily_med,
    items_value_rel = items_value / median_hh_exp$hh_exp_daily_med
  )

# Write each set's relative value as a rounded percentage, one file per set size.
walk2(
  1:3,
  c(
    "outputs/stats/item_set_1_value.tex",
    "outputs/stats/item_set_2_value.tex",
    "outputs/stats/item_set_3_value.tex"
  ),
  function(set_size, out_path) {
    rel_pct <- round(item_set_medians$items_value_rel[[set_size]] * 100, 0)
    writeLines(paste0(as.character(rel_pct), "\\%%"), out_path)
  }
)

# Number of observations and distinct respondents (KEY) in round 2, by trans
# pair status and discussion type.
r2_n <- r2_choices %>%
  mutate(discuss = group == 1) %>%
  group_by(pair_includes_trans, discuss_type) %>%
  summarise(n_obs = n(), n_people = n_distinct(KEY)) %>%
  print()


# Round-2 choices with the monetary value of each option's item set and the
# value difference between the two options.
r2_choices_with_item_val <- r2_choices_num %>%
  # Value of option 1's item set ...
  tidylog::left_join(item_set_medians, by = c("r2_items_1" = "n_items")) %>%
  rename(
    items_value_1 = items_value,
    items_value_rel_1 = items_value_rel
  ) %>%
  # ... and of option 2's.
  tidylog::left_join(item_set_medians, by = c("r2_items_2" = "n_items")) %>%
  rename(
    items_value_2 = items_value,
    items_value_rel_2 = items_value_rel
  ) %>%
  mutate(
    item_diff_value = items_value_2 - items_value_1,
    item_diff_value_rel = items_value_rel_2 - items_value_rel_1
  ) %>%
  # Flip the sign when the comparator is shown first.
  mutate(
    across(
      c(item_diff_value, item_diff_value_rel),
      ~ ifelse(comparator_order_in_pair == 1, -.x, .x)
    )
  ) %>%
  mutate(
    item_diff_value_100 = item_diff_value / 100,
    discuss = group == 1
  ) %>%
  # No `by` given: joins on the columns shared with r2_n.
  left_join(r2_n) %>%
  mutate(
    discuss_type_label = fct_label_append(
      discuss_type_label,
      lab_append = paste0("\n(N=", n_people, ")")
    ),
    pair_includes_trans_label = ifelse(
      pair_includes_trans, "Worker is trans", "Worker is non-trans"
    )
  )

# MORE GRANULAR
# Bootstrapped mean and interval of choosing the comparator, by item value
# difference, trans status and discussion type.
item_diff_summ <- r2_choices_with_item_val %>%
  group_by(
    item_diff_value, pair_includes_trans_label,
    discuss_type_label, discuss_type
  ) %>%
  summarise(mean_cl_boot(as.numeric(r2_choose_comparator)))

# R1

# Round-1 choices with the monetary value of each option's item set and the
# value difference between the two options.
r1_choices_with_item_val <- r1_choices %>%
  tidylog::left_join(item_set_medians, by = c("r1_items_1" = "n_items")) %>%
  rename(
    items_value_1 = items_value,
    items_value_rel_1 = items_value_rel
  ) %>%
  tidylog::left_join(item_set_medians, by = c("r1_items_2" = "n_items")) %>%
  rename(
    items_value_2 = items_value,
    items_value_rel_2 = items_value_rel
  ) %>%
  mutate(
    item_diff_value = items_value_2 - items_value_1,
    item_diff_value_rel = items_value_rel_2 - items_value_rel_1
  ) %>%
  # Flip the sign when the comparator is shown first.
  mutate(
    across(
      c(item_diff_value, item_diff_value_rel),
      ~ ifelse(comparator_order_in_pair == 1, -.x, .x)
    )
  ) %>%
  mutate(
    item_diff_value_100 = item_diff_value / 100,
    discuss = group == 1
  )

# Anonymous choices ---------------------------------------------------------------

# Phase-2 anonymous choices, one row per person and round. The seed fixes the
# random comparator drawn for pairs with neither a trans candidate nor a woman.
set.seed(12345)
anon_choices <- df %>%
  select(
    all_of(id_vars), discuss_type,
    video_type_control, video_type_placebo, video_type_treatment,
    discussion_full, treat_type_r2,
    matches("an_choice|anon_photo")
  ) %>%
  filter(phase == "phase_2") %>%
  pivot_longer(
    matches("(an_choice|anon_photo_\\d)_(\\d)"),
    names_to = c(".value", "round"),
    names_pattern = "(an_choice|anon_photo_\\d)_(\\d)"
  ) %>%
  mutate(
    # Photo ids starting with "T" / "W" mark trans candidates / women.
    trans_1 = str_detect(anon_photo_1, "^T"),
    trans_2 = str_detect(anon_photo_2, "^T"),
    anon_trans_i = case_when(trans_1 ~ 1, trans_2 ~ 2),
    woman_1 = str_detect(anon_photo_1, "^W"),
    woman_2 = str_detect(anon_photo_2, "^W"),
    anon_woman_i = case_when(woman_1 ~ 1, woman_2 ~ 2),
    # Comparator position: the trans candidate, else the woman, else a random draw.
    comparator_mm = sample(1:2, replace = TRUE, size = n()),
    comparator_i = coalesce(anon_trans_i, anon_woman_i, comparator_mm),
    anon_choose_trans = anon_trans_i == an_choice,
    anon_choose_comparator = comparator_i == an_choice,
    pair_includes_trans = !is.na(anon_trans_i),
    pair_includes_trans_alt = pair_includes_trans
  ) %>%
  mutate(
    across(
      c(discussion_full, pair_includes_trans, pair_includes_trans_alt),
      as.numeric
    )
  ) %>%
  left_join(control_vars_df, by = "ind_id")

glimpse(anon_choices)

# Order effects ---------------------------------------------------------------

# All trans-pair choices from both rounds, numbered in the order each person
# met them; round-2 rounds are shifted by 4 so they sort after round 1.
order_effects_df <- bind_rows(
  r1 = select(r1_choices, -group_label),
  r2 = mutate(r2_choices, round = round + 4)
) %>%
  mutate(
    choose_comparator = coalesce(r1_choose_comparator, r2_choose_comparator)
  ) %>%
  filter(as.numeric(pair_includes_trans) == 1) %>%
  arrange(ind_id, round) %>%
  group_by(ind_id) %>%
  mutate(trans_round = factor(row_number())) %>%
  ungroup() %>%
  mutate(
    # Encounters 1-2 are labelled treatment rounds, 3-4 outcome rounds.
    round_type = fct_relevel(
      case_when(
        trans_round %in% 1:2 ~ "Treatment round",
        trans_round %in% 3:4 ~ "Outcome round"
      ),
      "Treatment round"
    ),
    discussion_full = discuss_type %in% "discussion_full",
    # Encounter dummies and their interactions with the full-discussion arm.
    round_1 = as.numeric(trans_round == 1),
    round_2 = as.numeric(trans_round == 2),
    round_3 = as.numeric(trans_round == 3),
    round_4 = as.numeric(trans_round == 4),
    eff_round_1 = round_1 * discussion_full,
    eff_round_2 = round_2 * discussion_full,
    eff_round_3 = round_3 * discussion_full,
    eff_round_4 = round_4 * discussion_full
  )

# Dominated / non-dominated ---------------------------------------------------------------

# Classify every round-2 pair by whether the comparator side weakly dominates
# the alternative, is weakly dominated by it, or neither. Reliability only
# enters the comparison when it was shown (r2_reliability_shown == 1).
# Rows where no condition evaluates to TRUE (including rows with missing
# inputs) fall through to "incomparable".
r2_dominates <- r2_choices_num %>%
  mutate(
    dominate_status = fct_relevel(
      case_when(
        # Reliability shown: at least as good on both, strictly better on one.
        r2_reliability_shown == 1 & (
          (r2_reliability_diff > 0 & item_diff > 0) |
            (r2_reliability_diff == 0 & item_diff > 0) |
            (r2_reliability_diff > 0 & item_diff == 0)
        ) ~ "dominates",
        # Reliability hidden: only the item difference counts.
        r2_reliability_shown == 0 & item_diff > 0 ~ "dominates",
        r2_reliability_shown == 1 & (
          (r2_reliability_diff < 0 & item_diff < 0) |
            (r2_reliability_diff == 0 & item_diff < 0) |
            (r2_reliability_diff < 0 & item_diff == 0)
        ) ~ "dominated",
        r2_reliability_shown == 0 & item_diff < 0 ~ "dominated",
        TRUE ~ "incomparable"
      ),
      c("dominated", "incomparable", "dominates")
    ),
    # Display labels, in the same order as the status levels.
    dominate_status_label = fct_relevel(
      case_when(
        dominate_status == "dominated" ~ "Dominated",
        dominate_status == "incomparable" ~ "Neither\ndominates",
        dominate_status == "dominates" ~ "Dominates"
      ),
      c("Dominated", "Neither\ndominates", "Dominates")
    ),
    # 0/1 indicators for each status.
    dominates = as.numeric(dominate_status == "dominates"),
    dominated = as.numeric(dominate_status == "dominated"),
    incomparable = as.numeric(dominate_status == "incomparable")
  )


# R1 - dominated dominating ---------------------------------------------------------------
# Same dominance classification for the round-1 pairs, based on quality_diff
# and item_diff. Rows where no condition evaluates to TRUE (including rows
# with missing inputs) fall through to "incomparable".
r1_dominates <- r1_choices %>%
  mutate(
    dominate_status = fct_relevel(
      case_when(
        # At least as good on both dimensions and strictly better on one.
        (quality_diff > 0 & item_diff > 0) |
          (quality_diff == 0 & item_diff > 0) |
          (quality_diff > 0 & item_diff == 0) ~ "dominates",
        # At most as good on both dimensions and strictly worse on one.
        (quality_diff < 0 & item_diff < 0) |
          (quality_diff == 0 & item_diff < 0) |
          (quality_diff < 0 & item_diff == 0) ~ "dominated",
        TRUE ~ "incomparable"
      ),
      c("dominated", "incomparable", "dominates")
    ),
    # Display labels, in the same order as the status levels.
    dominate_status_label = fct_relevel(
      case_when(
        dominate_status == "dominated" ~ "Dominated",
        dominate_status == "incomparable" ~ "Neither\ndominates",
        dominate_status == "dominates" ~ "Dominates"
      ),
      c("Dominated", "Neither\ndominates", "Dominates")
    ),
    # 0/1 indicators for each status.
    dominates = as.numeric(dominate_status == "dominates"),
    dominated = as.numeric(dominate_status == "dominated"),
    incomparable = as.numeric(dominate_status == "incomparable")
  )

# Round-1 dominance rows for the two discussion arms, restricted to rows with
# group_role == 1 and stripped of every column whose name contains "ind_id".
r1_dominates_group_id <- r1_dominates %>%
  filter(
    discuss_type %in% c("discussion_full", "discussion_pair"),
    group_role == 1
  ) %>%
  select(-matches("ind_id"))

# Individual-level counts of round-1 pairs in each dominance category, kept
# only for pairs that include a trans candidate, plus "any such pair" flags.
# NOTE: the result is still grouped by ind_id (summarise() drops only the
# last grouping variable and there is no ungroup()).
r1_dominates_ind <- r1_dominates %>%
  group_by(ind_id, group_id, pair_includes_trans) %>%
  summarise(
    r1_n_dominates = sum_na(dominates),
    r1_n_incomparable = sum_na(incomparable),
    r1_n_dominated = sum_na(dominated)
  ) %>%
  filter(pair_includes_trans == 1) %>%
  mutate(
    r1_n_dominates_pos = as.numeric(r1_n_dominates > 0),
    r1_n_incomparable_pos = as.numeric(r1_n_incomparable > 0),
    r1_n_dominated_pos = as.numeric(r1_n_dominated > 0)
  ) %>%
  select(-pair_includes_trans) %>%
  print()

# Attach each individual's round-1 dominance counts to the round-2 dominance
# rows; clashing column names from the right-hand table get an "_extra" suffix.
r1_r2_dominates <- tidylog::left_join(
  r2_dominates,
  r1_dominates_ind,
  by = c("ind_id", "group_id"),
  suffix = c("", "_extra")
)

# Announcements ---------------------------------------------------------------

# Columns describing what was announced to respondents: the first capture
# group is the variable stem, the second (a single digit) becomes `round`.
announce_regex <- "(r1_discuss_announce_.*|r1_choice_other\\d_num|r1_choice_backup_other\\d_num)_(\\d)"

# One row per respondent and announced round, for phase-2 "choosing_only"
# respondents with announce_before == 1.
df_announcements <- df %>%
  filter(
    discuss_type == "choosing_only" & announce_before == 1,
    phase == "phase_2"
  ) %>%
  select(ind_id, discuss_type, matches(announce_regex)) %>%
  pivot_longer(
    matches(announce_regex),
    names_to = c(".value", "round"),
    names_pattern = announce_regex
  ) %>%
  mutate(pair_includes_trans = r1_discuss_announce_trans_1) %>%
  glimpse() %>%
  mutate(
    # Fall back to the backup entry when the main entry is missing.
    r1_choice_other1_num = coalesce(r1_choice_other1_num, r1_choice_backup_other1_num),
    r1_choice_other2_num = coalesce(r1_choice_other2_num, r1_choice_backup_other2_num),
    # Position of the photo whose ID starts with "T"; NA if neither does.
    which_photo_is_trans = case_when(
      str_detect(r1_discuss_announce_photo_1, "^T") ~ 1L,
      str_detect(r1_discuss_announce_photo_2, "^T") ~ 2L,
      TRUE ~ NA_integer_
    ),
    # Whether each of the two other people picked that photo.
    other1_selected_trans = r1_choice_other1_num == which_photo_is_trans,
    other2_selected_trans = r1_choice_other2_num == which_photo_is_trans
  ) %>%
  glimpse()

# Collapse the announcements to one row per individual: the mean over rounds
# of (number of the two others who picked the trans photo), divided by 2.
df_announcements_ind <- df_announcements %>%
  group_by(ind_id, discuss_type) %>%
  summarise(
    p_other_selected_trans = mean_na(other1_selected_trans + other2_selected_trans) / 2
  ) %>%
  ungroup()

# Own round-1 choices aggregated to the individual level, plus the
# leave-one-out mean of the other group members' shares.
r1_choices_ind <- r1_choices %>%
  group_by(group_id, ind_id) %>%
  summarise(p_self_selected_trans = mean_na(r1_choose_trans)) %>%
  group_by(group_id) %>%
  mutate(
    # Numerator: group total minus the individual's own share.
    # Denominator: number of OTHER members with a non-missing share. The
    # previous denominator, n() - 1, also counted members whose share is
    # missing; assuming sum_na() drops missing values (TODO confirm), those
    # members added nothing to the numerator and so pulled the mean toward
    # zero. With no missing shares the result is unchanged. An individual
    # whose own share is missing still gets NA.
    p_other_selected_trans_control =
      (sum_na(p_self_selected_trans) - p_self_selected_trans) /
      (sum(!is.na(p_self_selected_trans)) - 1)
  ) %>%
  ungroup()


# Round-1 choices as seen by listeners: keep rows with is_listener == 1 and
# convert the group's count of trans choices into a share out of 2.
# NOTE(review): the join has no explicit `by`, so it matches on every column
# name shared with the (ind_id, is_listener) lookup -- confirm that this is
# only ind_id.
r1_for_listeners <- r1_n_choose_trans_group %>%
  left_join(select(df, ind_id, is_listener)) %>%
  filter(is_listener == 1) %>%
  mutate(p_other_selected_trans_listener = n_r1_choose_trans_group / 2) %>%
  glimpse()

# Round-2 choices enriched with what each respondent learned about others'
# round-1 choices (announcements, group leave-one-out mean, or the listener
# measure) and with their own round-1 share.
r2_with_announce <- r2_choices_num %>%
  left_join(df_announcements_ind, by = c("ind_id", "discuss_type")) %>%
  tidylog::left_join(r1_choices_ind, by = c("group_id", "ind_id")) %>%
  # Clashing columns from r1_for_listeners get the suffix "listener"
  # (no separator).
  left_join(r1_for_listeners, by = c("group_id", "ind_id"), suffix = c("", "listener")) %>%
  ungroup() %>%
  mutate(
    # Missing own shares are replaced by median_na() of the column.
    p_self_selected_trans = ifelse(
      is.na(p_self_selected_trans),
      median_na(p_self_selected_trans),
      p_self_selected_trans
    ),
    # First available source wins: announcement, then group mean, then listener.
    p_other_selected_trans = coalesce(
      p_other_selected_trans,
      p_other_selected_trans_control,
      p_other_selected_trans_listener
    ),
    observer = as.numeric(discuss_type == "choosing_only" & announce_before == 1),
    diff_selected_trans = p_other_selected_trans - p_self_selected_trans,
    is_listener = ifelse(is.na(is_listener), 0, is_listener)
  ) %>%
  mutate(
    across(c(r2_choose_trans, public_non_observer), as.numeric)
  ) %>%
  mutate(control = as.numeric(discuss_type == "control"))

# Sample for the asymmetry analysis: phase-2 pairs that include a trans
# candidate, for listeners in pair discussions and for everyone in the
# "choosing_only" and "control" arms.
r2_for_asymmetry <- r2_with_announce %>%
  filter(
    pair_includes_trans == 1,
    phase == "phase_2",
    (discuss_type == "discussion_pair" & is_listener == 1) |
      discuss_type %in% c("choosing_only", "control")
  )
# Inspect the control indicator (prints when run interactively).
r2_for_asymmetry$control

# Follow-up choices ---------------------------------------------------------------
# Follow-up hiring choices in long format: one row per respondent and
# follow-up round. The seed is fixed because a random fallback comparator
# position is drawn below.
set.seed(12345)
fu_choices <- df_follow_up %>%
  select(
    group_id, ind_id, phase, stratum_id, discuss_type,
    public, public_observer, public_non_observer,
    discussion_pair_speaker, discussion_pair_listener,
    discussion_full, discussion_pooled, control,
    discussion_full, matches("treat_type"),
    first_survey_date, survey_date, video_type, group_label,
    matches("follow_up_"), matches("hiring_choice")
  ) %>%
  select(-follow_up_attempted) %>%
  pivot_longer(
    c(matches("follow_up_"), matches("hiring_choice")),
    names_to = c(".value", "round"),
    names_pattern = "(.*)_(\\d$)"
  ) %>%
  mutate(
    round = as.numeric(round),
    # Position of the photo whose ID contains "T" / "W" (unanchored match).
    follow_up_trans = case_when(
      str_detect(follow_up_photo_1, "T") ~ 1L,
      str_detect(follow_up_photo_2, "T") ~ 2L
    ),
    follow_up_female = case_when(
      str_detect(follow_up_photo_1, "W") ~ 1L,
      str_detect(follow_up_photo_2, "W") ~ 2L
    ),
    # Comparator: the trans photo if any, else the "W" photo, else a random
    # position.
    follow_up_comparator = coalesce(
      follow_up_trans, follow_up_female, sample(c(1, 2), n(), replace = TRUE)
    ),
    follow_up_choose_trans = hiring_choice == follow_up_trans,
    follow_up_choose_comparator = hiring_choice == follow_up_comparator,
    comparator_order_in_pair = follow_up_comparator,
    # Item difference, signed as comparator minus the other photo.
    item_diff = ifelse(
      follow_up_comparator == 1,
      follow_up_items_1 - follow_up_items_2,
      follow_up_items_2 - follow_up_items_1
    ),
    pair_includes_trans = !is.na(follow_up_trans),
    pair_includes_female = str_detect(follow_up_photo_1, "^W") | str_detect(follow_up_photo_2, "^W"),
    pair_includes_trans_alt = pair_includes_trans
  ) %>%
  # Replace discussion_full with the value from `df`; for the other shared
  # columns the follow-up copy is kept with an "_extra" suffix and the `df`
  # copy takes the plain name.
  select(-discussion_full) %>%
  left_join(
    select(
      df,
      ind_id, delivery_incentive_exp,
      discussion_full,
      discuss_type,
      stratum_id,
      treat_type_r2_label,
      video_type, video_type_control, video_type_placebo, video_type_treatment
    ),
    by = "ind_id",
    suffix = c("_extra", "")
  ) %>%
  mutate(
    across(where(is.logical), as.numeric)
  ) %>%
  mutate(
    # Try year-month-day parsing first, then day-month-year.
    first_survey_date = coalesce(ymd(first_survey_date), dmy(first_survey_date)),
    # Whole days between the first survey and the follow-up survey.
    diff_date = interval(first_survey_date, survey_date) %/% days(1),
    # Week of the first survey (weeks start on Monday).
    first_survey_date_week_fes = floor_date(first_survey_date, "week", week_start = 1)
  )

# Follow up group predic ----------------------------------------------------------------------------------------------------------------

# Quick look at the available group-prediction columns.
df_follow_up %>%
  select(matches("group_predic")) %>%
  glimpse()

# Follow-up group predictions in long format: one row per respondent and
# prediction round.
fu_group_predic <- df_follow_up %>%
  pivot_longer(
    matches("group_predic_(fu_)?(choice|photo|educ|reliability|trans|age|items)_\\d"),
    names_to = c(".value", "round"),
    names_pattern = "(group_.*)_(\\d)$"
  ) %>%
  mutate(
    # Position of the photo whose ID starts with "T"; NA if neither does.
    group_predic_fu_trans = case_when(
      str_detect(group_predic_fu_photo_1, "^T") ~ 1,
      str_detect(group_predic_fu_photo_2, "^T") ~ 2
    ),
    # Differences are signed as trans photo minus the other photo.
    item_diff = case_when(
      group_predic_fu_trans == 2 ~ group_predic_fu_items_2 - group_predic_fu_items_1,
      group_predic_fu_trans == 1 ~ group_predic_fu_items_1 - group_predic_fu_items_2
    ),
    reliability_diff = case_when(
      group_predic_fu_trans == 2 ~ group_predic_fu_reliability_2 - group_predic_fu_reliability_1,
      group_predic_fu_trans == 1 ~ group_predic_fu_reliability_1 - group_predic_fu_reliability_2
    ),
    # A missing difference is treated as "reliability not shown" and zeroed.
    reliability_shown = !is.na(reliability_diff),
    reliability_diff = ifelse(!reliability_shown, 0, reliability_diff),
    # Reliability of the non-trans photo, zeroed when not shown.
    reliability_benchmark = case_when(
      group_predic_fu_trans == 2 ~ group_predic_fu_reliability_1,
      group_predic_fu_trans == 1 ~ group_predic_fu_reliability_2
    ),
    reliability_benchmark = ifelse(!reliability_shown, 0, reliability_benchmark),
    # -99 is recoded to missing before building the outcome.
    group_predic_choice = ifelse(group_predic_choice == -99, NA, group_predic_choice),
    group_predic_choose_trans = as.numeric(group_predic_choice == group_predic_fu_trans)
  ) %>%
  mutate(
    across(where(is.logical), as.numeric)
  )

# Memory check  ---------------------------------------------------------------

# Columns used for the "choosing only" memory check; the trailing digit is the
# round.
mem_check_regex_choosing_only <- "(r1_choice2?_other\\d_num|r1_choice2?_backup_other\\d_num|memcheck_choice_choosing_only|memcheck_choice_choosing_only_label|mem_photo_\\d)_(\\d)"

# Answer coding for the memory check:
#   1 = both choose A
#   2 = mix
#   3 = both choose B
# Partial-credit lookup over every (recalled, actual) combination: 1 for an
# exact match, 0.5 when the answers are one category apart, 0 when they are
# two categories apart.
mem_correct_alt_vals <- tidyr::crossing(
  memcheck_choice_choosing_only = c(1, 2, 3),
  correct_choices_val = 1:3
) %>%
  mutate(
    mem_correct_alt = 1 - abs(memcheck_choice_choosing_only - correct_choices_val) / 2
  )


# CHOOSING ONLY
# Memory check for the "choosing only" arm: compares the respondent's recalled
# answer about the two other people's choices with what those people chose,
# scored with the partial-credit lookup in mem_correct_alt_vals.
mem_choosing_only <- df %>%
  filter(choosing_only == 1) %>%
  select(all_of(id_vars), formdef_version, discuss_type, announce_before, announce_after,
         matches(mem_check_regex_choosing_only)) %>%
 
  # First reshape: one row per respondent and round.
  pivot_longer(
    matches(mem_check_regex_choosing_only),
    names_pattern = mem_check_regex_choosing_only,
    names_to = c(".value", "round")
  ) %>%
  # Second reshape: one row per respondent, round and "other person".
  pivot_longer(
    matches("r1_choice"),
    names_pattern = "(r1_choice\\d?|r1_choice\\d?_backup)_other(\\d)",
    names_to = c(".value", "other_person")
  ) %>%

  # Amalgamate all the sources:
  # (the first non-missing of the four choice columns is used)
  mutate(
    actual_choice_person_i = coalesce(r1_choice, r1_choice_backup, r1_choice2, r1_choice2_backup)
  ) %>%
  select(-c(r1_choice, r1_choice_backup, r1_choice2, r1_choice2_backup)) %>%
  # Group back to the pair-level
  # (the other people's choices are collected into a list column)
  group_by(across(c(all_of(id_vars), formdef_version, mem_photo_1, mem_photo_2, ind_id, round, discuss_type, announce_before, announce_after, matches("memcheck_choice")))) %>%
  summarise(correct_choices = list(actual_choice_person_i)) %>%

  # Translate the set of actual choices into the answer coding:
  # {1} -> 1, {2} -> 3, {1, 2} -> 2, anything else (e.g. a missing choice) -> NA.
  # setequal() compares as sets, so duplicates are ignored.
  mutate(correct_choices_val = map_dbl(correct_choices, ~ {
    if (setequal(.x, c(1, 1))) return(1)
    else if (setequal(.x, c(2, 2))) return(3)
    else if (setequal(.x, c(1, 2))) return(2)
    else return(NA_real_)
  })) %>%

  # The exact-match indicator computed here is dropped a few steps below and
  # replaced by the partial-credit score from the lookup table.
  mutate(
    mem_correct = memcheck_choice_choosing_only == correct_choices_val
  ) %>%
  left_join(mem_correct_alt_vals, by = c("correct_choices_val", "memcheck_choice_choosing_only")) %>%
  ungroup %>%
  select(-mem_correct) %>%
  rename(mem_correct = mem_correct_alt) %>%
  # TRUE when either photo ID starts with "T".
  mutate(pair_includes_trans = str_detect(mem_photo_1, "^T") | str_detect(mem_photo_2, "^T")) %>%
  glimpse

# ALTERNATIVE CALCULATION AS A CHECK
# Recomputes the exact-match memory indicator from the choice columns without
# the "_num" suffix (compared against "A"/"B" below). The result is not
# assigned to any object; it is only produced for inspection.
df %>%
  filter(choosing_only == 1) %>%
  select(all_of(id_vars), formdef_version, discuss_type, announce_before, announce_after,
         matches("r1_choice2?(_backup)?_other\\d(_num)?"),
         matches("memcheck_choice_choosing_only(_label)?_\\d")) %>%
  # Everything except the identifier columns is reshaped to one row per round.
  pivot_longer(
    -c(all_of(id_vars), formdef_version, discuss_type, announce_before, announce_after),
    names_pattern = "(.*)_(\\d)$",
    names_to = c(".value", "round")
  ) %>%
  # Strip the "நபர் " prefix (presumably Tamil for "person" -- confirm) so that
  # only the choice letter remains.
  mutate(
    across(matches("r1_choice2?_other\\d"),
           ~ str_replace(.x, "நபர் ",  "" ))
  ) %>%
  # Row by row, gather the non-missing choices of the other people.
  rowwise() %>%
  mutate(
    r1_choice_list = list(c(r1_choice_other1, r1_choice_other2, r1_choice2_other1, r1_choice2_other2) %>% .[!is.na(.)])
  ) %>%
  # Same answer coding as the main calculation: 1 = both A, 2 = mix, 3 = both B.
  mutate(
    correct_choices = case_when(
      setequal(r1_choice_list, c("A", "B")) ~ 2,
      setequal(r1_choice_list, c("A", "A")) ~ 1,
      setequal(r1_choice_list, c("B", "B")) ~ 3,
    )
  ) %>%
  ungroup %>%
  mutate(
    mem_correct = correct_choices == memcheck_choice_choosing_only
  )

# Columns used for the discussion-arm memory check; the trailing digit is the
# round.
mem_check_regex_discuss <- "(group_choice|group_choice_backup|memcheck_choice_d|memcheck_choice_d_speak|mem_photo_\\d)_(\\d)"

# DISCUSSIONS (full and pair)
# The actual choice comes from group_choice, falling back to
# group_choice_backup; the recalled choice comes from memcheck_choice_d,
# falling back to memcheck_choice_d_speak.
# NOTE: the result stays grouped by discuss_type (no ungroup()).
mem_discussion <- df %>%
  filter(discussion_full == 1 | discussion_pair == 1) %>%
  select(
    all_of(id_vars), formdef_version, discuss_type, is_listener,
    matches(mem_check_regex_discuss)
  ) %>%
  select(-matches("\\["), -matches("duration")) %>%
  pivot_longer(
    matches(mem_check_regex_discuss),
    names_to = c(".value", "round"),
    names_pattern = mem_check_regex_discuss
  ) %>%
  mutate(
    actual_choice = coalesce(group_choice, group_choice_backup),
    memcheck_choice = coalesce(memcheck_choice_d, memcheck_choice_d_speak)
  ) %>%
  select(-c(group_choice, group_choice_backup, memcheck_choice_d, memcheck_choice_d_speak)) %>%
  mutate(mem_correct = actual_choice == memcheck_choice) %>%
  group_by(discuss_type) %>%
  mutate(
    # TRUE when either photo ID starts with "T".
    pair_includes_trans = str_detect(mem_photo_1, "^T") | str_detect(mem_photo_2, "^T")
  )

# No discussion (control arm): the actual choice is the respondent's own
# ind_choice, compared with memcheck_choice_ind.
mem_check_regex_ind <- "(ind_choice|memcheck_choice_ind|mem_ind_photo_\\d)_(\\d)"

# NOTE: the result stays grouped by discuss_type (no ungroup()).
mem_ind <- df %>%
  filter(discuss_type == "control") %>%
  select(all_of(id_vars), formdef_version, discuss_type, matches(mem_check_regex_ind)) %>%
  select(-matches("\\["), -matches("duration")) %>%
  pivot_longer(
    matches(mem_check_regex_ind),
    names_to = c(".value", "round"),
    names_pattern = mem_check_regex_ind
  ) %>%
  mutate(mem_correct = ind_choice == memcheck_choice_ind) %>%
  group_by(discuss_type) %>%
  mutate(
    # TRUE when either photo ID starts with "T".
    pair_includes_trans = str_detect(mem_ind_photo_1, "^T") | str_detect(mem_ind_photo_2, "^T")
  )

# All memory-check rows (choosing only, discussions, control) stacked together
# with plotting labels.
mem_check_all <- bind_rows(mem_choosing_only, mem_discussion, mem_ind) %>%
  mutate(
    is_listener = factor(is_listener),
    announce_after = factor(announce_after),
    # NA when is_listener is missing.
    person_category = case_when(
      is_listener == 1 ~ "Listener",
      is_listener == 0 ~ "Discussant"
    ),
    # What "correct" means depends on the arm.
    outcome = case_when(
      discuss_type == "control" ~ "P(recalled own choices)",
      discuss_type == "choosing_only" ~ "P(recalled other's choices)",
      discuss_type %in% c("discussion_full", "discussion_pair") ~ "P(recalled discussion choices)"
    ),
    # Build the label from the logical column before converting it to 0/1.
    pair_includes_trans_label = ifelse(
      pair_includes_trans,
      "Choice includes trans",
      "Choice doesn't include trans"
    ),
    pair_includes_trans = as.numeric(pair_includes_trans)
  )


# Summary of mem_correct by arm, outcome and person category, computed by
# mean_cl_cluster() with group_id as its second argument.
mem_check_summ <- mem_check_all %>%
  group_by(discuss_type, discuss_type_label, outcome, person_category) %>%
  summarise(mean_cl_cluster(mem_correct, group_id))

# Duration of R1 choices - phase 2 ---------------------------------------------------------------
# Quick look at the choice-duration columns (names containing both "duration"
# and "choice", excluding any containing "[").
df %>%
  select(matches("duration")) %>%
  select(matches("choice")) %>%
  select(-matches("\\[")) %>%
  glimpse()

# Duration columns to reshape; the trailing digit is the round.
duration_regex <- "(duration_group_choice|duration_group_choices_speakers|duration_ind_choice|duration_ind_choice_pre)_(\\d)"

# Get all the ind_ids for all non-control groups
group_id_ind_id <- df %>%
  filter(phase == "phase_2", discuss_type != "control") %>%
  select(group_id, ind_id) %>%
  arrange(group_id, ind_id)


# Per-round duration of the round-1 choices in phase 2, one row per individual
# and round. The raw values are differenced between consecutive rounds
# (presumably they are cumulative timestamps -- confirm), so the first round of
# each individual has no predecessor and is dropped.
choice_durations <- df %>%
  filter(phase == "phase_2") %>%
  select(group_id, ind_id, all_of(id_vars), group_role, discuss_type, matches(duration_regex)) %>%
  pivot_longer(
    matches(duration_regex),
    names_pattern = duration_regex,
    names_to = c("duration_type", "round")
  ) %>%
  mutate(round = as.numeric(round)) %>%
  # Keep one duration source per arm: the group_role == 5 row for the
  # discussion arms, the group_role == 1 row for "choosing_only", and each
  # respondent's own row for control.
  filter(
    duration_type == "duration_group_choice" & group_role == 5 & (discuss_type == "discussion_full" | discuss_type == "discussion_pair") |
      duration_type == "duration_group_choices_speakers" & group_role == 1 & discuss_type == "choosing_only" |
      duration_type == "duration_ind_choice" & discuss_type == "control"
  ) %>%
  # Non-control durations are recorded once per group: blank out the recording
  # row's ind_id and copy the row to every individual in that group.
  mutate(
    ind_id = ifelse(discuss_type == "control", ind_id, NA)
  ) %>%
  left_join(group_id_ind_id, by = "group_id", suffix = c("", "_new")) %>%
  mutate(ind_id = coalesce(ind_id, ind_id_new)) %>%
  select(-ind_id_new) %>%
  # Sort by round within individual so that lag() below really refers to the
  # previous round. Sorting on (group_id, ind_id) alone left the within-person
  # order to whatever column order the pivot happened to produce.
  arrange(group_id, ind_id, round) %>%
  group_by(ind_id) %>%
  mutate(value = value - lag(value)) %>%
  filter(!is.na(value)) %>%
  ungroup() %>%
  tidylog::mutate(duration_choice_winsorised = datawizard::winsorize(value, threshold = 0.01, method = "percentile"))

# Per-round duration of the round-2 hiring choices in phase 2.
# NOTE: lag() below is not grouped, so it relies on the rows coming out of
# pivot_longer() individual by individual with rounds in ascending order; only
# rounds other than the first use it.
choice_durations_r2 <- df %>%
  filter(phase == "phase_2") %>%
  select(
    all_of(id_vars),
    matches("(duration_hiring_choice_pre|duration_hiring_choice)_\\d")
  ) %>%
  pivot_longer(
    matches("(duration_hiring_choice_pre|duration_hiring_choice)_\\d"),
    names_to = c(".value", "round"),
    names_pattern = "(duration_hiring_choice_pre|duration_hiring_choice)_(\\d)"
  ) %>%
  mutate(
    round = as.numeric(round),
    # Round 1: difference from the "_pre" value of the same round.
    duration_1 = ifelse(
      round == "1",
      duration_hiring_choice - duration_hiring_choice_pre,
      NA
    ),
    # Later rounds: difference from the previous row's value.
    duration_others = ifelse(
      round != "1",
      duration_hiring_choice - lag(duration_hiring_choice),
      NA
    ),
    duration_r2_choice = coalesce(duration_1, duration_others),
    choosing_only = discuss_type == "choosing_only",
    discussion_pooled = discuss_type == "discussion_full" | discuss_type == "discussion_pair",
    duration_r2_choice_winsorised = datawizard::winsorize(
      duration_r2_choice,
      threshold = 0.01,
      method = "percentile"
    )
  )

# duration_group_choice_sheet : assistant, discussion_pair
# duration_group_choice_sheet_no_l: assistant, discussion_full or choosing_only

# Phase-2 round-2 choices with their per-round durations attached.
r2_w_durations <- r2_choices_num %>%
  filter(phase == "phase_2") %>%
  tidylog::left_join(
    choice_durations_r2,
    by = c("group_id", "ind_id", "round"),
    suffix = c("", "_extra")
  )

# Round-1 choices with their per-round durations attached; the winsorised
# duration is divided by 60.
r1_w_durations <- r1_choices %>%
  tidylog::left_join(
    choice_durations,
    by = c("group_id", "ind_id", "round"),
    suffix = c("", "_extra")
  ) %>%
  mutate(duration_choice_winsorised = duration_choice_winsorised / 60)
